Total coverage: 352231 (20%)of 1836385
12 232 13 17 20 17 16 16 1 1 1 1 1 1 5 1 1 1 22 22 22 22 21 1 1 1 1 1 3 17 44 44 49 49 10 10 6 7 7 16 16 16 7 17 16 7 2 13 1 1 5 5 8 8 1 7 19 18 20 20 19 3 246 244 245 247 246 3 1 2 3 3 17 17 17 17 1 1 1 20 19 19 19 2 20 8 19 19 19 5 19 19 19 7 17 20 20 7 2 17 20 20 1 3 6 17 20 18 17 18 2 20 17 3 1 10 2 16 210 212 212 210 210 16 16 212 194 16 16 16 232 215 12 2 5 5 5 234 145 145 232 232 232 193 12 4 212 232 232 232 197 197 196 158 232 17 17 17 17 12 244 245 1 1 1 1 1 1 1 18 40 40 40 40 14 15 15 14 15 15 15 15 16 16 35 35 1 1 1 1 1 1 1 1 1 1 1 1 33 32 33 33 1 1 1 1 16 1 15 1 52 53 53 18 18 18 18 18 2 64 67 67 33 32 1 3 3 2 2 3 3 3 3 3 3 1 3 3 2 42 39 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * MMU support * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "irq.h" #include "ioapic.h" #include "mmu.h" #include "mmu_internal.h" #include "tdp_mmu.h" #include "x86.h" #include "kvm_cache_regs.h" #include "smm.h" #include "kvm_emulate.h" #include "page_track.h" #include "cpuid.h" #include "spte.h" #include <linux/kvm_host.h> #include <linux/types.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> #include <linux/srcu.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> #include <linux/kstrtox.h> #include <linux/kthread.h> #include <linux/wordpart.h> #include <asm/page.h> #include <asm/memtype.h> #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/set_memory.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> #include "trace.h" static bool nx_hugepage_mitigation_hard_disabled; int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; #else static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { .set = set_nx_huge_pages_recovery_param, .get = param_get_uint, }; module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); __MODULE_PARM_TYPE(nx_huge_pages, "bool"); module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_period_ms, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint"); static bool __read_mostly force_flush_and_sync_on_reuse; module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: * 1. the guest-virtual to guest-physical * 2. while doing 1. it walks guest-physical to host-physical * If the hardware supports that we don't need to do shadow paging. */ bool tdp_enabled = false; static bool __ro_after_init tdp_mmu_allowed; #ifdef CONFIG_X86_64 bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444); #endif static int max_huge_page_level __read_mostly; static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; #define PTE_PREFETCH_NUM 8 #include <trace/events/kvm.h> /* make pte_list_desc fit well in cache lines */ #define PTE_LIST_EXT 14 /* * struct pte_list_desc is the core data structure used to implement a custom * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a * given GFN when used in the context of rmaps. Using a custom list allows KVM * to optimize for the common case where many GFNs will have at most a handful * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small * memory footprint, which in turn improves runtime performance by exploiting * cache locality. * * A list is comprised of one or more pte_list_desc objects (descriptors). * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor * is full and a new SPTEs needs to be added, a new descriptor is allocated and * becomes the head of the list. This means that by definitions, all tail * descriptors are full. * * Note, the meta data fields are deliberately placed at the start of the * structure to optimize the cacheline layout; accessing the descriptor will * touch only a single cacheline so long as @spte_count<=6 (or if only the * descriptors metadata is accessed). */ struct pte_list_desc { struct pte_list_desc *more; /* The number of PTEs stored in _this_ descriptor. */ u32 spte_count; /* The number of PTEs stored in all tails of this descriptor. */ u32 tail_count; u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { u64 addr; hpa_t shadow_addr; u64 *sptep; int level; unsigned index; }; #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ (_root), (_addr)); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) #define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)) && \ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ __shadow_walk_next(&(_walker), spte)) static struct kmem_cache *pte_list_desc_cache; struct kmem_cache *mmu_page_header_cache; static void mmu_spte_set(u64 *sptep, u64 spte); struct kvm_mmu_role_regs { const unsigned long cr0; const unsigned long cr4; const u64 efer; }; #define CREATE_TRACE_POINTS #include "mmutrace.h" /* * Yes, lot's of underscores. They're a hint that you probably shouldn't be * reading from the role_regs. Once the root_role is constructed, it becomes * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ static inline bool __maybe_unused \ ____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \ { \ return !!(regs->reg & flag); \ } BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG); BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE); BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX); BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); /* * The MMU itself (with a valid role) is the single source of truth for the * MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1, * and the vCPU may be incorrect/irrelevant. */ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->cpu_role. base_or_ext . reg##_##name); \ } BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke); BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57); BUILD_MMU_ROLE_ACCESSOR(base, efer, nx); BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma); static inline bool is_cr0_pg(struct kvm_mmu *mmu) { return mmu->cpu_role.base.level > 0; } static inline bool is_cr4_pae(struct kvm_mmu *mmu) { return !mmu->cpu_role.base.has_4_byte_gpte; } static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = { .cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS), .cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS), .efer = vcpu->arch.efer, }; return regs; } static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); } static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3) return kvm_read_cr3(vcpu); return mmu->get_guest_pgd(vcpu); } static inline bool kvm_available_flush_remote_tlbs_range(void) { #if IS_ENABLED(CONFIG_HYPERV) return kvm_x86_ops.flush_remote_tlbs_range; #else return false; #endif } static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index); /* Flush the range of guest memory mapped by the given SPTE. */ static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep)); kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned int access) { u64 spte = make_mmio_spte(vcpu, gfn, access); trace_mark_mmio_spte(sptep, gfn, spte); mmu_spte_set(sptep, spte); } static gfn_t get_mmio_spte_gfn(u64 spte) { u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask; gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN) & shadow_nonpresent_or_rsvd_mask; return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { return spte & shadow_mmio_access_mask; } static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { u64 kvm_gen, spte_gen, gen; gen = kvm_vcpu_memslots(vcpu)->generation; if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS)) return false; kvm_gen = gen & MMIO_SPTE_GEN_MASK; spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); return likely(kvm_gen == spte_gen); } static int is_cpuid_PSE36(void) { return 1; } #ifdef CONFIG_X86_64 static void __set_spte(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); WRITE_ONCE(*sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { KVM_MMU_WARN_ON(is_ept_ve_possible(spte)); return xchg(sptep, spte); } static u64 __get_spte_lockless(u64 *sptep) { return READ_ONCE(*sptep); } #else union split_spte { struct { u32 spte_low; u32 spte_high; }; u64 spte; }; static void count_spte_clear(u64 *sptep, u64 spte) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; /* Ensure the spte is completely set before we increase the count */ smp_wmb(); sp->clear_spte_count++; } static void __set_spte(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; ssptep->spte_high = sspte.spte_high; /* * If we map the spte from nonpresent to present, We should store * the high bits firstly, then set present bit, so cpu can not * fetch this spte while we are setting the spte. */ smp_wmb(); WRITE_ONCE(ssptep->spte_low, sspte.spte_low); } static void __update_clear_spte_fast(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; WRITE_ONCE(ssptep->spte_low, sspte.spte_low); /* * If we map the spte from present to nonpresent, we should clear * present bit firstly to avoid vcpu fetch the old high bits. */ smp_wmb(); ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); } static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) { union split_spte *ssptep, sspte, orig; ssptep = (union split_spte *)sptep; sspte = (union split_spte)spte; /* xchg acts as a barrier before the setting of the high bits */ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); orig.spte_high = ssptep->spte_high; ssptep->spte_high = sspte.spte_high; count_spte_clear(sptep, spte); return orig.spte; } /* * The idea using the light way get the spte on x86_32 guest is from * gup_get_pte (mm/gup.c). * * An spte tlb flush may be pending, because they are coalesced and * we are running out of the MMU lock. Therefore * we need to protect against in-progress updates of the spte. * * Reading the spte while an update is in progress may get the old value * for the high part of the spte. The race is fine for a present->non-present * change (because the high part of the spte is ignored for non-present spte), * but for a present->present change we must reread the spte. * * All such changes are done in two steps (present->non-present and * non-present->present), hence it is enough to count the number of * present->non-present updates: if it changed while reading the spte, * we might have hit the race. This is done using clear_spte_count. */ static u64 __get_spte_lockless(u64 *sptep) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; retry: count = sp->clear_spte_count; smp_rmb(); spte.spte_low = orig->spte_low; smp_rmb(); spte.spte_high = orig->spte_high; smp_rmb(); if (unlikely(spte.spte_low != orig->spte_low || count != sp->clear_spte_count)) goto retry; return spte.spte; } #endif /* Rules for using mmu_spte_set: * Set the sptep from nonpresent to present. * Note: the sptep being assigned *must* be either not present * or in a state where the hardware will not attempt to update * the spte. */ static void mmu_spte_set(u64 *sptep, u64 new_spte) { WARN_ON_ONCE(is_shadow_present_pte(*sptep)); __set_spte(sptep, new_spte); } /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changed. * * Returns true if the TLB needs to be flushed */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; WARN_ON_ONCE(!is_shadow_present_pte(new_spte)); check_spte_writable_invariants(new_spte); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); return false; } if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); WARN_ON_ONCE(!is_shadow_present_pte(old_spte) || spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); return leaf_spte_change_needs_tlb_flush(old_spte, new_spte); } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. * Returns the old PTE. */ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { u64 old_spte = *sptep; int level = sptep_to_sp(sptep)->role.level; if (!is_shadow_present_pte(old_spte) || !spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); else old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE); if (!is_shadow_present_pte(old_spte)) return old_spte; kvm_update_page_stats(kvm, level, -1); return old_spte; } /* * Rules for using mmu_spte_clear_no_track: * Directly clear spte without caring the state bits of sptep, * it is used to set the upper level spte. */ static void mmu_spte_clear_no_track(u64 *sptep) { __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE); } static u64 mmu_spte_get_lockless(u64 *sptep) { return __get_spte_lockless(sptep); } static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu) { return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct; } static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_begin(); } else { /* * Prevent page table teardown by making any free-er wait during * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { if (is_tdp_mmu_active(vcpu)) { kvm_tdp_mmu_walk_lockless_end(); } else { /* * Make sure the write to vcpu->mode is not reordered in front of * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */ r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) return r; if (kvm_has_mirrored_tdp(vcpu->kvm)) { r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_external_spt_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; } r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; if (maybe_indirect) { r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache, PT64_ROOT_MAX_LEVEL); if (r) return r; } return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_external_spt_cache); kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) { kmem_cache_free(pte_list_desc_cache, pte_list_desc); } static bool sp_has_gptes(struct kvm_mmu_page *sp); static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) { if (sp->role.passthrough) return sp->gfn; if (sp->shadowed_translation) return sp->shadowed_translation[index] >> PAGE_SHIFT; return sp->gfn + (index << ((sp->role.level - 1) * SPTE_LEVEL_BITS)); } /* * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note * that the SPTE itself may have a more constrained access permissions that * what the guest enforces. For example, a guest may create an executable * huge PTE but KVM may disallow execution to mitigate iTLB multihit. */ static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index) { if (sp->shadowed_translation) return sp->shadowed_translation[index] & ACC_ALL; /* * For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs, * KVM is not shadowing any guest page tables, so the "guest access * permissions" are just ACC_ALL. * * For direct SPs in indirect MMUs (shadow paging), i.e. when KVM * is shadowing a guest huge page with small pages, the guest access * permissions being shadowed are the access permissions of the huge * page. * * In both cases, sp->role.access contains the correct access bits. */ return sp->role.access; } static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index, gfn_t gfn, unsigned int access) { if (sp->shadowed_translation) { sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access; return; } WARN_ONCE(access != kvm_mmu_page_get_access(sp, index), "access mismatch under %s page %llx (expected %u, got %u)\n", sp->role.passthrough ? "passthrough" : "direct", sp->gfn, kvm_mmu_page_get_access(sp, index), access); WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index), "gfn mismatch under %s page %llx (expected %llx, got %llx)\n", sp->role.passthrough ? "passthrough" : "direct", sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn); } static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, unsigned int access) { gfn_t gfn = kvm_mmu_page_get_gfn(sp, index); kvm_mmu_page_set_translation(sp, index, gfn, access); } /* * Return the pointer to the large page information for a given gfn, * handling slots that are not large page aligned. */ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, const struct kvm_memory_slot *slot, int level) { unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->arch.lpage_info[level - 2][idx]; } /* * The most significant bit in disallow_lpage tracks whether or not memory * attributes are mixed, i.e. not identical for all gfns at the current level. * The lower order bits are used to refcount other cases where a hugepage is * disallowed, e.g. if KVM has shadow a page table at the gfn. */ #define KVM_LPAGE_MIXED_FLAG BIT(31) static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); old = linfo->disallow_lpage; linfo->disallow_lpage += count; WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; gfn_t gfn; kvm->arch.indirect_shadow_pages++; /* * Ensure indirect_shadow_pages is elevated prior to re-reading guest * child PTEs in FNAME(gpte_changed), i.e. guarantee either in-flight * emulated writes are visible before re-reading guest PTEs, or that * an emulated write will see the elevated count and acquire mmu_lock * to update SPTEs. Pairs with the smp_mb() in kvm_mmu_track_write(). */ smp_mb(); gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); /* the non-leaf shadow pages are keeping readonly. */ if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_add_gfn(kvm, slot, gfn); kvm_mmu_gfn_disallow_lpage(slot, gfn); if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K)) kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K); } void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { /* * If it's possible to replace the shadow page with an NX huge page, * i.e. if the shadow page is the only thing currently preventing KVM * from using a huge page, add the shadow page to the list of "to be * zapped for NX recovery" pages. Note, the shadow page can already be * on the list if KVM is reusing an existing shadow page, i.e. if KVM * links a shadow page at multiple points. */ if (!list_empty(&sp->possible_nx_huge_page_link)) return; ++kvm->stat.nx_lpage_splits; list_add_tail(&sp->possible_nx_huge_page_link, &kvm->arch.possible_nx_huge_pages); } static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp, bool nx_huge_page_possible) { sp->nx_huge_page_disallowed = true; if (nx_huge_page_possible) track_possible_nx_huge_page(kvm, sp); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; gfn_t gfn; kvm->arch.indirect_shadow_pages--; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); if (sp->role.level > PG_LEVEL_4K) return __kvm_write_track_remove_gfn(kvm, slot, gfn); kvm_mmu_gfn_allow_lpage(slot, gfn); } void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { if (list_empty(&sp->possible_nx_huge_page_link)) return; --kvm->stat.nx_lpage_splits; list_del_init(&sp->possible_nx_huge_page_link); } static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp) { sp->nx_huge_page_disallowed = false; untrack_possible_nx_huge_page(kvm, sp); } static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL; return slot; } /* * About rmap_head encoding: * * If the bit zero of rmap_head->val is clear, then it points to the only spte * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct * pte_list_desc containing more mappings. */ #define KVM_RMAP_MANY BIT(0) /* * Returns the number of pointers in the rmap chain, not counting the new one. */ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int count = 0; if (!rmap_head->val) { rmap_head->val = (unsigned long)spte; } else if (!(rmap_head->val & KVM_RMAP_MANY)) { desc = kvm_mmu_memory_cache_alloc(cache); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; desc->spte_count = 2; desc->tail_count = 0; rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; ++count; } else { desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); count = desc->tail_count + desc->spte_count; /* * If the previous head is full, allocate a new head descriptor * as tail descriptors are always kept full. */ if (desc->spte_count == PTE_LIST_EXT) { desc = kvm_mmu_memory_cache_alloc(cache); desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); desc->spte_count = 0; desc->tail_count = count; rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY; } desc->sptes[desc->spte_count++] = spte; } return count; } static void pte_list_desc_remove_entry(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i) { struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); int j = head_desc->spte_count - 1; /* * The head descriptor should never be empty. A new head is added only * when adding an entry and the previous head is full, and heads are * removed (this flow) when they become empty. */ KVM_BUG_ON_DATA_CORRUPTION(j < 0, kvm); /* * Replace the to-be-freed SPTE with the last valid entry from the head * descriptor to ensure that tail descriptors are full at all times. * Note, this also means that tail_count is stable for each descriptor. */ desc->sptes[i] = head_desc->sptes[j]; head_desc->sptes[j] = NULL; head_desc->spte_count--; if (head_desc->spte_count) return; /* * The head descriptor is empty. If there are no tail descriptors, * nullify the rmap head to mark the list as empty, else point the rmap * head at the next descriptor, i.e. the new head. */ if (!head_desc->more) rmap_head->val = 0; else rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY; mmu_free_pte_list_desc(head_desc); } static void pte_list_remove(struct kvm *kvm, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; int i; if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm)) return; if (!(rmap_head->val & KVM_RMAP_MANY)) { if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm)) return; rmap_head->val = 0; } else { desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); while (desc) { for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(kvm, rmap_head, desc, i); return; } } desc = desc->more; } KVM_BUG_ON_DATA_CORRUPTION(true, kvm); } } static void kvm_zap_one_rmap_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, u64 *sptep) { mmu_spte_clear_track_bits(kvm, sptep); pte_list_remove(kvm, sptep, rmap_head); } /* Return true if at least one SPTE was zapped, false otherwise */ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc, *next; int i; if (!rmap_head->val) return false; if (!(rmap_head->val & KVM_RMAP_MANY)) { mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); goto out; } desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); for (; desc; desc = next) { for (i = 0; i < desc->spte_count; i++) mmu_spte_clear_track_bits(kvm, desc->sptes[i]); next = desc->more; mmu_free_pte_list_desc(desc); } out: /* rmap_head is meaningless now, remember to reset it */ rmap_head->val = 0; return true; } unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; if (!rmap_head->val) return 0; else if (!(rmap_head->val & KVM_RMAP_MANY)) return 1; desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); return desc->tail_count + desc->spte_count; } static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, const struct kvm_memory_slot *slot) { unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte_index(spte)); /* * Unlike rmap_add, rmap_remove does not run in the context of a vCPU * so we have to determine which memslots to use based on context * information in sp->role. */ slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); pte_list_remove(kvm, spte, rmap_head); } /* * Used by the following functions to iterate through the sptes linked by a * rmap. All fields are private and not assumed to be used outside. */ struct rmap_iterator { /* private fields */ struct pte_list_desc *desc; /* holds the sptep if not NULL */ int pos; /* index of the sptep */ }; /* * Iteration must be started by this function. This should also be used after * removing/dropping sptes from the rmap link because in such cases the * information in the iterator may not be valid. * * Returns sptep if found, NULL otherwise. */ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, struct rmap_iterator *iter) { u64 *sptep; if (!rmap_head->val) return NULL; if (!(rmap_head->val & KVM_RMAP_MANY)) { iter->desc = NULL; sptep = (u64 *)rmap_head->val; goto out; } iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY); iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: BUG_ON(!is_shadow_present_pte(*sptep)); return sptep; } /* * Must be used with a valid iterator: e.g. after rmap_get_first(). * * Returns sptep if found, NULL otherwise. */ static u64 *rmap_get_next(struct rmap_iterator *iter) { u64 *sptep; if (iter->desc) { if (iter->pos < PTE_LIST_EXT - 1) { ++iter->pos; sptep = iter->desc->sptes[iter->pos]; if (sptep) goto out; } iter->desc = iter->desc->more; if (iter->desc) { iter->pos = 0; /* desc->sptes[0] cannot be NULL */ sptep = iter->desc->sptes[iter->pos]; goto out; } } return NULL; out: BUG_ON(!is_shadow_present_pte(*sptep)); return sptep; } #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \ _spte_; _spte_ = rmap_get_next(_iter_)) static void drop_spte(struct kvm *kvm, u64 *sptep) { u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush) { struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); if (flush) kvm_flush_remote_tlbs_sptep(kvm, sptep); } /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte write-protection is caused by protecting shadow page table. * * Note: write protection is difference between dirty logging and spte * protection: * - for dirty logging, the spte can be set to writable at anytime if * its dirty bitmap is properly set. * - for spte protection, the spte can be writable only after unsync-ing * shadow page. * * Return true if tlb need be flushed. */ static bool spte_write_protect(u64 *sptep, bool pt_protect) { u64 spte = *sptep; if (!is_writable_pte(spte) && !(pt_protect && is_mmu_writable_spte(spte))) return false; if (pt_protect) spte &= ~shadow_mmu_writable_mask; spte = spte & ~PT_WRITABLE_MASK; return mmu_spte_update(sptep, spte); } static bool rmap_write_protect(struct kvm_rmap_head *rmap_head, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) flush |= spte_write_protect(sptep, pt_protect); return flush; } static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; KVM_MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; return mmu_spte_update(sptep, spte); } /* * Gets the GFN ready for another round of dirty logging by clearing the * - D bit on ad-enabled SPTEs, and * - W bit on ad-disabled SPTEs. * Returns true iff any D or W bits were cleared. */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) if (spte_ad_need_write_protect(*sptep)) flush |= test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); else flush |= spte_clear_dirty(sptep); return flush; } static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, true); if (!kvm_memslots_have_rmaps(kvm)) return; while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); rmap_write_protect(rmap_head, false); /* clear the first set bit */ mask &= mask - 1; } } static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { struct kvm_rmap_head *rmap_head; if (tdp_mmu_enabled) kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, slot->base_gfn + gfn_offset, mask, false); if (!kvm_memslots_have_rmaps(kvm)) return; while (mask) { rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ mask &= mask - 1; } } void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { /* * If the slot was assumed to be "initially all dirty", write-protect * huge pages to ensure they are split to 4KiB on the first write (KVM * dirty logs at 4KiB granularity). If eager page splitting is enabled, * immediately try to split huge pages, e.g. so that vCPUs don't get * saddled with the cost of splitting. * * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn * of memslot has no such restriction, so the range can cross two large * pages. */ if (kvm_dirty_log_manual_protect_and_init_set(kvm)) { gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask); gfn_t end = slot->base_gfn + gfn_offset + __fls(mask); if (READ_ONCE(eager_page_split)) kvm_mmu_try_split_huge_pages(kvm, slot, start, end + 1, PG_LEVEL_4K); kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M); /* Cross two large pages? */ if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) != ALIGN(end << PAGE_SHIFT, PMD_SIZE)) kvm_mmu_slot_gfn_write_protect(kvm, slot, end, PG_LEVEL_2M); } /* * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in * mask. If PML is enabled and the GFN doesn't need to be write- * protected for other reasons, e.g. shadow paging, clear the Dirty bit. * Otherwise clear the Writable bit. * * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is * enabled but it chooses between clearing the Dirty bit and Writeable * bit based on the context. */ if (kvm_x86_ops.cpu_dirty_log_size) kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } int kvm_cpu_dirty_log_size(void) { return kvm_x86_ops.cpu_dirty_log_size; } bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level) { struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= rmap_write_protect(rmap_head, true); } } if (tdp_mmu_enabled) write_protected |= kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level); return write_protected; } static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K); } static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { return kvm_zap_all_rmap_sptes(kvm, rmap_head); } struct slot_rmap_walk_iterator { /* input fields. */ const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; int end_level; /* output fields. */ gfn_t gfn; struct kvm_rmap_head *rmap; int level; /* private field. */ struct kvm_rmap_head *end_rmap; }; static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; iterator->start_level = start_level; iterator->end_level = end_level; iterator->start_gfn = start_gfn; iterator->end_gfn = end_gfn; rmap_walk_init_level(iterator, iterator->start_level); } static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) { return !!iterator->rmap; } static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) { while (++iterator->rmap <= iterator->end_rmap) { iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level); if (iterator->rmap->val) return; } if (++iterator->level > iterator->end_level) { iterator->rmap = NULL; return; } rmap_walk_init_level(iterator, iterator->level); } #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ _start_gfn, _end_gfn, _iter_) \ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ _end_level_, _start_gfn, _end_gfn); \ slot_rmap_walk_okay(_iter_); \ slot_rmap_walk_next(_iter_)) /* The return value indicates if tlb flush on all vcpus is needed. */ typedef bool (*slot_rmaps_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot); static __always_inline bool __walk_slot_rmaps(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool can_yield, bool flush_on_yield, bool flush) { struct slot_rmap_walk_iterator iterator; lockdep_assert_held_write(&kvm->mmu_lock); for_each_slot_rmap_range(slot, start_level, end_level, start_gfn, end_gfn, &iterator) { if (iterator.rmap) flush |= fn(kvm, iterator.rmap, slot); if (!can_yield) continue; if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { if (flush && flush_on_yield) { kvm_flush_remote_tlbs_range(kvm, start_gfn, iterator.gfn - start_gfn + 1); flush = false; } cond_resched_rwlock_write(&kvm->mmu_lock); } } return flush; } static __always_inline bool walk_slot_rmaps(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, int start_level, int end_level, bool flush_on_yield) { return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level, slot->base_gfn, slot->base_gfn + slot->npages - 1, true, flush_on_yield, false); } static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm, const struct kvm_memory_slot *slot, slot_rmaps_handler fn, bool flush_on_yield) { return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); } static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, bool can_yield, bool flush) { return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, can_yield, true, flush); } bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; /* * To prevent races with vCPUs faulting in a gfn using stale data, * zapping a gfn range must be protected by mmu_invalidate_in_progress * (and mmu_invalidate_seq). The only exception is memslot deletion; * in that case, SRCU synchronization ensures that SPTEs are zapped * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the * invalid slot. */ lockdep_assert_once(kvm->mmu_invalidate_in_progress || lockdep_is_held(&kvm->slots_lock)); if (kvm_memslots_have_rmaps(kvm)) flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, range->start, range->end, range->may_block, flush); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); if (kvm_x86_ops.set_apic_access_page_addr && range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); return flush; } #define RMAP_RECYCLE_THRESHOLD 1000 static void __rmap_add(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, const struct kvm_memory_slot *slot, u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; int rmap_count; sp = sptep_to_sp(spte); kvm_mmu_page_set_translation(sp, spte_index(spte), gfn, access); kvm_update_page_stats(kvm, sp->role.level, 1); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); rmap_count = pte_list_add(cache, spte, rmap_head); if (rmap_count > kvm->stat.max_mmu_rmap_size) kvm->stat.max_mmu_rmap_size = rmap_count; if (rmap_count > RMAP_RECYCLE_THRESHOLD) { kvm_zap_all_rmap_sptes(kvm, rmap_head); kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level); } } static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot, u64 *spte, gfn_t gfn, unsigned int access) { struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache; __rmap_add(vcpu->kvm, cache, slot, spte, gfn, access); } static bool kvm_rmap_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool test_only) { struct slot_rmap_walk_iterator iterator; struct rmap_iterator iter; bool young = false; u64 *sptep; for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, range->start, range->end - 1, &iterator) { for_each_rmap_spte(iterator.rmap, &iter, sptep) { u64 spte = *sptep; if (!is_accessed_spte(spte)) continue; if (test_only) return true; if (spte_ad_enabled(spte)) { clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } else { /* * WARN if mmu_spte_update() signals the need * for a TLB flush, as Access tracking a SPTE * should never trigger an _immediate_ flush. */ spte = mark_spte_for_access_track(spte); WARN_ON_ONCE(mmu_spte_update(sptep, spte)); } young = true; } } return young; } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) young = kvm_rmap_age_gfn_range(kvm, range, false); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_age_gfn_range(kvm, range); return young; } bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { bool young = false; if (kvm_memslots_have_rmaps(kvm)) young = kvm_rmap_age_gfn_range(kvm, range, true); if (tdp_mmu_enabled) young |= kvm_tdp_mmu_test_age_gfn(kvm, range); return young; } static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) { #ifdef CONFIG_KVM_PROVE_MMU int i; for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i]))) pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free", sp->spt[i], &sp->spt[i], kvm_mmu_page_get_gfn(sp, i)); } #endif } static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm->arch.n_used_mmu_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { kvm->arch.n_used_mmu_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp) { kvm_mmu_check_sptes_at_free(sp); hlist_del(&sp->hash_link); list_del(&sp->link); free_page((unsigned long)sp->spt); free_page((unsigned long)sp->shadowed_translation); kmem_cache_free(mmu_page_header_cache, sp); } static unsigned kvm_page_table_hashfn(gfn_t gfn) { return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache, struct kvm_mmu_page *sp, u64 *parent_pte) { if (!parent_pte) return; pte_list_add(cache, parent_pte, &sp->parent_ptes); } static void mmu_page_remove_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { pte_list_remove(kvm, parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *parent_pte) { mmu_page_remove_parent_pte(kvm, sp, parent_pte); mmu_spte_clear_no_track(parent_pte); } static void mark_unsync(u64 *spte); static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) { mark_unsync(sptep); } } static void mark_unsync(u64 *spte) { struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap)) return; if (sp->unsync_children++) return; kvm_mmu_mark_parents_unsync(sp); } #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { struct mmu_page_and_offset { struct kvm_mmu_page *sp; unsigned int idx; } page[KVM_PAGE_ARRAY_NR]; unsigned int nr; }; static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { int i; if (sp->unsync) for (i=0; i < pvec->nr; i++) if (pvec->page[i].sp == sp) return 0; pvec->page[pvec->nr].sp = sp; pvec->page[pvec->nr].idx = idx; pvec->nr++; return (pvec->nr == KVM_PAGE_ARRAY_NR); } static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx) { --sp->unsync_children; WARN_ON_ONCE((int)sp->unsync_children < 0); __clear_bit(idx, sp->unsync_child_bitmap); } static int __mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { int i, ret, nr_unsync_leaf = 0; for_each_set_bit(i, sp->unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; if (!is_shadow_present_pte(ent) || is_large_pte(ent)) { clear_unsync_child_bit(sp, i); continue; } child = spte_to_child_sp(ent); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); if (!ret) { clear_unsync_child_bit(sp, i); continue; } else if (ret > 0) { nr_unsync_leaf += ret; } else return ret; } else if (child->unsync) { nr_unsync_leaf++; if (mmu_pages_add(pvec, child, i)) return -ENOSPC; } else clear_unsync_child_bit(sp, i); } return nr_unsync_leaf; } #define INVALID_INDEX (-1) static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { pvec->nr = 0; if (!sp->unsync_children) return 0; mmu_pages_add(pvec, sp, INVALID_INDEX); return __mmu_unsync_walk(sp, pvec); } static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON_ONCE(!sp->unsync); trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); static bool sp_has_gptes(struct kvm_mmu_page *sp) { if (sp->role.direct) return false; if (sp->role.passthrough) return false; return true; } #define for_each_valid_sp(_kvm, _sp, _list) \ hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ for_each_valid_sp(_kvm, _sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role; /* * Ignore various flags when verifying that it's safe to sync a shadow * page using the current MMU context. * * - level: not part of the overall MMU role and will never match as the MMU's * level tracks the root level * - access: updated based on the new guest PTE * - quadrant: not part of the overall MMU role (similar to level) */ const union kvm_mmu_page_role sync_role_ign = { .level = 0xf, .access = 0x7, .quadrant = 0x3, .passthrough = 0x1, }; /* * Direct pages can never be unsync, and KVM should never attempt to * sync a shadow page for a different MMU context, e.g. if the role * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the * reserved bits checks will be wrong, etc... */ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte || (sp->role.word ^ root_role.word) & ~sync_role_ign.word)) return false; return true; } static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { /* sp->spt[i] has initial value of shadow page table allocation */ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE) return 0; return vcpu->arch.mmu->sync_spte(vcpu, sp, i); } static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int flush = 0; int i; if (!kvm_sync_page_check(vcpu, sp)) return -1; for (i = 0; i < SPTE_ENT_PER_PAGE; i++) { int ret = kvm_sync_spte(vcpu, sp, i); if (ret < -1) return -1; flush |= ret; } /* * Note, any flush is purely for KVM's correctness, e.g. when dropping * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier * unmap or dirty logging event doesn't fail to flush. The guest is * responsible for flushing the TLB to ensure any changes in protection * bits are recognized, i.e. until the guest flushes or page faults on * a relevant address, KVM is architecturally allowed to let vCPUs use * cached translations with the old protection bits. */ return flush; } static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int ret = __kvm_sync_page(vcpu, sp); if (ret < 0) kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, struct list_head *invalid_list, bool remote_flush) { if (!remote_flush && list_empty(invalid_list)) return false; if (!list_empty(invalid_list)) kvm_mmu_commit_zap_page(kvm, invalid_list); else kvm_flush_remote_tlbs(kvm); return true; } static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { if (sp->role.invalid) return true; /* TDP MMU pages do not use the MMU generation. */ return !is_tdp_mmu_page(sp) && unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); } struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL]; unsigned int idx[PT64_ROOT_MAX_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ for (i = mmu_pages_first(&pvec, &parents); \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) static int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, int i) { int n; for (n = i+1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; unsigned idx = pvec->page[n].idx; int level = sp->role.level; parents->idx[level-1] = idx; if (level == PG_LEVEL_4K) break; parents->parent[level-2] = sp; } return n; } static int mmu_pages_first(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents) { struct kvm_mmu_page *sp; int level; if (pvec->nr == 0) return 0; WARN_ON_ONCE(pvec->page[0].idx != INVALID_INDEX); sp = pvec->page[0].sp; level = sp->role.level; WARN_ON_ONCE(level == PG_LEVEL_4K); parents->parent[level-2] = sp; /* Also set up a sentinel. Further entries in pvec are all * children of sp, so this element is never overwritten. */ parents->parent[level-1] = NULL; return mmu_pages_next(pvec, parents, 0); } static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; unsigned int level = 0; do { unsigned int idx = parents->idx[level]; sp = parents->parent[level]; if (!sp) return; WARN_ON_ONCE(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; } while (!sp->unsync_children); } static int mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *parent, bool can_yield) { int i; struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); bool flush = false; while (mmu_unsync_walk(parent, &pages)) { bool protected = false; for_each_sp(pages, sp, parents, i) protected |= kvm_vcpu_write_protect_gfn(vcpu, sp->gfn); if (protected) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); flush = false; } for_each_sp(pages, sp, parents, i) { kvm_unlink_unsync_page(vcpu->kvm, sp); flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0; mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); if (!can_yield) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); return -EINTR; } cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); flush = false; } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); return 0; } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { atomic_set(&sp->write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) { __clear_sp_write_flooding_count(sptep_to_sp(spte)); } /* * The vCPU is required when finding indirect shadow pages; the shadow * page may already exist and syncing it needs the vCPU pointer in * order to read guest page tables. Direct shadow pages are never * unsync, thus @vcpu can be NULL if @role.direct is true. */ static struct kvm_mmu_page *kvm_mmu_find_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; int ret; int collisions = 0; LIST_HEAD(invalid_list); for_each_valid_sp(kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; } if (sp->role.word != role.word) { /* * If the guest is creating an upper-level page, zap * unsync pages for the same gfn. While it's possible * the guest is using recursive page tables, in all * likelihood the guest has stopped using the unsync * page and is installing a completely unrelated page. * Unsync pages must not be left as is, because the new * upper-level page will be write-protected. */ if (role.level > PG_LEVEL_4K && sp->unsync) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); continue; } /* unsync and write-flooding only apply to indirect SPs. */ if (sp->role.direct) goto out; if (sp->unsync) { if (KVM_BUG_ON(!vcpu, kvm)) break; /* * The page is good, but is stale. kvm_sync_page does * get the latest guest state, but (unlike mmu_unsync_children) * it doesn't write-protect the page or mark it synchronized! * This way the validity of the mapping is ensured, but the * overhead of write protection is not incurred until the * guest invalidates the TLB mapping. This allows multiple * SPs for a single gfn to be unsync. * * If the sync fails, the page is zapped. If so, break * in order to rebuild it. */ ret = kvm_sync_page(vcpu, sp, &invalid_list); if (ret < 0) break; WARN_ON_ONCE(!list_empty(&invalid_list)); if (ret > 0) kvm_flush_remote_tlbs(kvm); } __clear_sp_write_flooding_count(sp); goto out; } sp = NULL; ++kvm->stat.mmu_cache_miss; out: kvm_mmu_commit_zap_page(kvm, &invalid_list); if (collisions > kvm->stat.max_mmu_page_hash_collisions) kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } /* Caches used when allocating a new shadow page. */ struct shadow_page_caches { struct kvm_mmu_memory_cache *page_header_cache; struct kvm_mmu_memory_cache *shadow_page_cache; struct kvm_mmu_memory_cache *shadowed_info_cache; }; static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm, struct shadow_page_caches *caches, gfn_t gfn, struct hlist_head *sp_list, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; sp = kvm_mmu_memory_cache_alloc(caches->page_header_cache); sp->spt = kvm_mmu_memory_cache_alloc(caches->shadow_page_cache); if (!role.direct && role.level <= KVM_MAX_HUGEPAGE_LEVEL) sp->shadowed_translation = kvm_mmu_memory_cache_alloc(caches->shadowed_info_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); INIT_LIST_HEAD(&sp->possible_nx_huge_page_link); /* * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages() * depends on valid pages being added to the head of the list. See * comments in kvm_zap_obsolete_pages(). */ sp->mmu_valid_gen = kvm->arch.mmu_valid_gen; list_add(&sp->link, &kvm->arch.active_mmu_pages); kvm_account_mmu_page(kvm, sp); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, sp_list); if (sp_has_gptes(sp)) account_shadowed(kvm, sp); return sp; } /* Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. */ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm, struct kvm_vcpu *vcpu, struct shadow_page_caches *caches, gfn_t gfn, union kvm_mmu_page_role role) { struct hlist_head *sp_list; struct kvm_mmu_page *sp; bool created = false; sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role); if (!sp) { created = true; sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role); } trace_kvm_mmu_get_page(sp, created); return sp; } static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu, gfn_t gfn, union kvm_mmu_page_role role) { struct shadow_page_caches caches = { .page_header_cache = &vcpu->arch.mmu_page_header_cache, .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache, .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache, }; return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role); } static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct, unsigned int access) { struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep); union kvm_mmu_page_role role; role = parent_sp->role; role.level--; role.access = access; role.direct = direct; role.passthrough = 0; /* * If the guest has 4-byte PTEs then that means it's using 32-bit, * 2-level, non-PAE paging. KVM shadows such guests with PAE paging * (i.e. 8-byte PTEs). The difference in PTE size means that KVM must * shadow each guest page table with multiple shadow page tables, which * requires extra bookkeeping in the role. * * Specifically, to shadow the guest's page directory (which covers a * 4GiB address space), KVM uses 4 PAE page directories, each mapping * 1GiB of the address space. @role.quadrant encodes which quarter of * the address space each maps. * * To shadow the guest's page tables (which each map a 4MiB region), KVM * uses 2 PAE page tables, each mapping a 2MiB region. For these, * @role.quadrant encodes which half of the region they map. * * Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE * consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow * PDPTEs; those 4 PAE page directories are pre-allocated and their * quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes * bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume * bit 21 in the PTE (the child here), KVM propagates that bit to the * quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE * covers bit 21 (see above), thus the quadrant is calculated from the * _least_ significant bit of the PDE index. */ if (role.has_4_byte_gpte) { WARN_ON_ONCE(role.level != PG_LEVEL_4K); role.quadrant = spte_index(sptep) & 1; } return role; } static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, bool direct, unsigned int access) { union kvm_mmu_page_role role; if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) return ERR_PTR(-EEXIST); role = kvm_mmu_child_role(sptep, direct, access); return kvm_mmu_get_shadow_page(vcpu, gfn, role); } static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, hpa_t root, u64 addr) { iterator->addr = addr; iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->root_role.level; if (iterator->level >= PT64_ROOT_4LEVEL && vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->root_role.direct) iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { /* * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here. */ BUG_ON(root != vcpu->arch.mmu->root.hpa); iterator->shadow_addr = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= SPTE_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) iterator->level = 0; } } static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, u64 addr) { shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root.hpa, addr); } static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PG_LEVEL_4K) return false; iterator->index = SPTE_INDEX(iterator->addr, iterator->level); iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; return true; } static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, u64 spte) { if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK; --iterator->level; } static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) { __shadow_walk_next(iterator, *iterator->sptep); } static void __link_shadow_page(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, u64 *sptep, struct kvm_mmu_page *sp, bool flush) { u64 spte; BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); /* * If an SPTE is present already, it must be a leaf and therefore * a large one. Drop it, and flush the TLB if needed, before * installing sp. */ if (is_shadow_present_pte(*sptep)) drop_large_spte(kvm, sptep, flush); spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp)); mmu_spte_set(sptep, spte); mmu_page_add_parent_pte(cache, sp, sptep); /* * The non-direct sub-pagetable must be updated before linking. For * L1 sp, the pagetable is updated via kvm_sync_page() in * kvm_mmu_find_shadow_page() without write-protecting the gfn, * so sp->unsync can be true or false. For higher level non-direct * sp, the pagetable is updated/synced via mmu_sync_children() in * FNAME(fetch)(), so sp->unsync_children can only be false. * WARN_ON_ONCE() if anything happens unexpectedly. */ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync) mark_unsync(sptep); } static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *sp) { __link_shadow_page(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, true); } static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; /* * For the direct sp, if the guest pte's dirty bit * changed form clean to dirty, it will corrupt the * sp's access: allow writable in the read-only sp, * so we should update the spte at this point to get * a new sp with the correct access. */ child = spte_to_child_sp(*sptep); if (child->role.access == direct_access) return; drop_parent_pte(vcpu->kvm, child, sptep); kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep); } } /* Returns the number of zapped non-leaf child shadow pages. */ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *spte, struct list_head *invalid_list) { u64 pte; struct kvm_mmu_page *child; pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); } else { child = spte_to_child_sp(pte); drop_parent_pte(kvm, child, spte); /* * Recursively zap nested TDP SPs, parentless SPs are * unlikely to be used again in the near future. This * avoids retaining a large number of stale nested SPs. */ if (tdp_enabled && invalid_list && child->role.guest_mode && !child->parent_ptes.val) return kvm_mmu_prepare_zap_page(kvm, child, invalid_list); } } else if (is_mmio_spte(kvm, pte)) { mmu_spte_clear_no_track(spte); } return 0; } static int kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int zapped = 0; unsigned i; for (i = 0; i < SPTE_ENT_PER_PAGE; ++i) zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); return zapped; } static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *sptep; struct rmap_iterator iter; while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) drop_parent_pte(kvm, sp, sptep); } static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *parent, struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; struct kvm_mmu_pages pages; if (parent->role.level == PG_LEVEL_4K) return 0; while (mmu_unsync_walk(parent, &pages)) { struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } } return zapped; } static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list, int *nr_zapped) { bool list_unstable, zapped_root = false; lockdep_assert_held_write(&kvm->mmu_lock); trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); kvm_mmu_unlink_parents(kvm, sp); /* Zapping children means active_mmu_pages has become unstable. */ list_unstable = *nr_zapped; if (!sp->role.invalid && sp_has_gptes(sp)) unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { /* Count self */ (*nr_zapped)++; /* * Already invalid pages (previously active roots) are not on * the active page list. See list_del() in the "else" case of * !sp->root_count. */ if (sp->role.invalid) list_add(&sp->link, invalid_list); else list_move(&sp->link, invalid_list); kvm_unaccount_mmu_page(kvm, sp); } else { /* * Remove the active root from the active page list, the root * will be explicitly freed when the root_count hits zero. */ list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also * treats invalid shadow pages as being obsolete. */ zapped_root = !is_obsolete_sp(kvm, sp); } if (sp->nx_huge_page_disallowed) unaccount_nx_huge_page(kvm, sp); sp->role.invalid = 1; /* * Make the request to free obsolete roots after marking the root * invalid, otherwise other vCPUs may not see it as invalid. */ if (zapped_root) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); return list_unstable; } static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list) { int nr_zapped; __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); return nr_zapped; } static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list) { struct kvm_mmu_page *sp, *nsp; if (list_empty(invalid_list)) return; /* * We need to make sure everyone sees our modifications to * the page tables and see changes to vcpu->mode here. The barrier * in the kvm_flush_remote_tlbs() achieves this. This pairs * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. * * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); list_for_each_entry_safe(sp, nsp, invalid_list, link) { WARN_ON_ONCE(!sp->role.invalid || sp->root_count); kvm_mmu_free_shadow_page(sp); } } static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, unsigned long nr_to_zap) { unsigned long total_zapped = 0; struct kvm_mmu_page *sp, *tmp; LIST_HEAD(invalid_list); bool unstable; int nr_zapped; if (list_empty(&kvm->arch.active_mmu_pages)) return 0; restart: list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { /* * Don't zap active root pages, the page itself can't be freed * and zapping it will just force vCPUs to realloc and reload. */ if (sp->root_count) continue; unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &nr_zapped); total_zapped += nr_zapped; if (total_zapped >= nr_to_zap) break; if (unstable) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm->stat.mmu_recycled += total_zapped; return total_zapped; } static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) { if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) return kvm->arch.n_max_mmu_pages - kvm->arch.n_used_mmu_pages; return 0; } static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) return 0; kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); /* * Note, this check is intentionally soft, it only guarantees that one * page is available, while the caller may end up allocating as many as * four pages, e.g. for PAE roots or for 5-level paging. Temporarily * exceeding the (arbitrary by default) limit will not harm the host, * being too aggressive may unnecessarily kill the guest, and getting an * exact count is far more trouble than it's worth, especially in the * page fault paths. */ if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; return 0; } /* * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { write_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - goal_nr_mmu_pages); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; write_unlock(&kvm->mmu_lock); } bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, bool always_retry) { struct kvm *kvm = vcpu->kvm; LIST_HEAD(invalid_list); struct kvm_mmu_page *sp; gpa_t gpa = cr2_or_gpa; bool r = false; /* * Bail early if there aren't any write-protected shadow pages to avoid * unnecessarily taking mmu_lock lock, e.g. if the gfn is write-tracked * by a third party. Reading indirect_shadow_pages without holding * mmu_lock is safe, as this is purely an optimization, i.e. a false * positive is benign, and a false negative will simply result in KVM * skipping the unprotect+retry path, which is also an optimization. */ if (!READ_ONCE(kvm->arch.indirect_shadow_pages)) goto out; if (!vcpu->arch.mmu->root_role.direct) { gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); if (gpa == INVALID_GPA) goto out; } write_lock(&kvm->mmu_lock); for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa)) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); /* * Snapshot the result before zapping, as zapping will remove all list * entries, i.e. checking the list later would yield a false negative. */ r = !list_empty(&invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); out: if (r || always_retry) { vcpu->arch.last_retry_eip = kvm_rip_read(vcpu); vcpu->arch.last_retry_addr = cr2_or_gpa; } return r; } static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++kvm->stat.mmu_unsync; sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); } /* * Attempt to unsync any shadow pages that can be reached by the specified gfn, * KVM is creating a writable mapping for said gfn. Returns 0 if all pages * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, bool synchronizing, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; /* * Force write-protection if the page is being tracked. Note, the page * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ if (kvm_gfn_is_write_tracked(kvm, slot, gfn)) return -EPERM; /* * The page is not write-tracked, mark existing shadow pages unsync * unless KVM is synchronizing an unsync SP. In that case, KVM must * complete emulation of the guest TLB flush before allowing shadow * pages to become unsync (writable by the guest). */ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) { if (synchronizing) return -EPERM; if (sp->unsync) continue; if (prefetch) return -EEXIST; /* * TDP MMU page faults require an additional spinlock as they * run with mmu_lock held for read, not write, and the unsync * logic is not thread safe. Take the spinklock regardless of * the MMU type to avoid extra conditionals/parameters, there's * no meaningful penalty if mmu_lock is held for write. */ if (!locked) { locked = true; spin_lock(&kvm->arch.mmu_unsync_pages_lock); /* * Recheck after taking the spinlock, a different vCPU * may have since marked the page unsync. A false * negative on the unprotected check above is not * possible as clearing sp->unsync _must_ hold mmu_lock * for write, i.e. unsync cannot transition from 1->0 * while this CPU holds mmu_lock for read (or write). */ if (READ_ONCE(sp->unsync)) continue; } WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(kvm, sp); } if (locked) spin_unlock(&kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible * before the SPTE is updated to allow writes because * kvm_mmu_sync_roots() checks the unsync flags without holding * the MMU lock and so can race with this. If the SPTE was updated * before the page had been marked as unsync-ed, something like the * following could happen: * * CPU 1 CPU 2 * --------------------------------------------------------------------- * 1.2 Host updates SPTE * to be writable * 2.1 Guest writes a GPTE for GVA X. * (GPTE being in the guest page table shadowed * by the SP from CPU 1.) * This reads SPTE during the page table walk. * Since SPTE.W is read as 1, there is no * fault. * * 2.2 Guest issues TLB flush. * That causes a VM Exit. * * 2.3 Walking of unsync pages sees sp->unsync is * false and skips the page. * * 2.4 Guest accesses GVA X. * Since the mapping in the SP was not updated, * so the old mapping for GVA X incorrectly * gets used. * 1.1 Host marks SP * as unsync * (sp->unsync = true) * * The write barrier below ensures that 1.1 happens before 1.2 and thus * the situation in 2.4 does not arise. It pairs with the read barrier * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. */ smp_wmb(); return 0; } static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, u64 *sptep, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = sptep_to_sp(sptep); int level = sp->role.level; int was_rmapped = 0; int ret = RET_PF_FIXED; bool flush = false; bool wrprot; u64 spte; /* Prefetching always gets a writable pfn. */ bool host_writable = !fault || fault->map_writable; bool prefetch = !fault || fault->prefetch; bool write_fault = fault && fault->write; if (unlikely(is_noslot_pfn(pfn))) { vcpu->stat.pf_mmio_spte_created++; mark_mmio_spte(vcpu, sptep, gfn, pte_access); return RET_PF_EMULATE; } if (is_shadow_present_pte(*sptep)) { if (prefetch) return RET_PF_SPURIOUS; /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; child = spte_to_child_sp(pte); drop_parent_pte(vcpu->kvm, child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { drop_spte(vcpu->kvm, sptep); flush = true; } else was_rmapped = 1; } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, false, host_writable, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; } else { flush |= mmu_spte_update(sptep, spte); trace_kvm_mmu_set_spte(level, gfn, sptep); } if (wrprot && write_fault) ret = RET_PF_WRITE_PROTECTED; if (flush) kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level); if (!was_rmapped) { WARN_ON_ONCE(ret == RET_PF_SPURIOUS); rmap_add(vcpu, slot, sptep, gfn, pte_access); } else { /* Already rmapped but the pte_access bits may have changed. */ kvm_mmu_page_set_access(sp, spte_index(sptep), pte_access); } return ret; } static bool kvm_mmu_prefetch_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, u64 *sptep, int nr_pages, unsigned int access) { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; int i; if (WARN_ON_ONCE(nr_pages > PTE_PREFETCH_NUM)) return false; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); if (!slot) return false; nr_pages = kvm_prefetch_pages(slot, gfn, pages, nr_pages); if (nr_pages <= 0) return false; for (i = 0; i < nr_pages; i++, gfn++, sptep++) { mmu_set_spte(vcpu, slot, sptep, access, gfn, page_to_pfn(pages[i]), NULL); /* * KVM always prefetches writable pages from the primary MMU, * and KVM can make its SPTE writable in the fast page handler, * without notifying the primary MMU. Mark pages/folios dirty * now to ensure file data is written back if it ends up being * written by the guest. Because KVM's prefetching GUPs * writable PTEs, the probability of unnecessary writeback is * extremely low. */ kvm_release_page_dirty(pages[i]); } return true; } static bool direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) { gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(start)); unsigned int access = sp->role.access; return kvm_mmu_prefetch_sptes(vcpu, gfn, start, end - start, access); } static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *sptep) { u64 *spte, *start = NULL; int i; WARN_ON_ONCE(!sp->role.direct); i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { if (is_shadow_present_pte(*spte) || spte == sptep) { if (!start) continue; if (!direct_pte_prefetch_many(vcpu, sp, start, spte)) return; start = NULL; } else if (!start) start = spte; } if (start) direct_pte_prefetch_many(vcpu, sp, start, spte); } static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between * actually accessed translations and prefetched, so disable pte * prefetch if accessed bits aren't available. */ if (sp_ad_disabled(sp)) return; if (sp->role.level > PG_LEVEL_4K) return; /* * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; __direct_pte_prefetch(vcpu, sp, sptep); } /* * Lookup the mapping level for @gfn in the current mm. * * WARNING! Use of host_pfn_mapping_level() requires the caller and the end * consumer to be tied into KVM's handlers for MMU notifier events! * * There are several ways to safely use this helper: * * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation * event for the hva. This can be done by explicit checking the MMU notifier * or by ensuring that KVM already has a valid mapping that covers the hva. * * - Do not use the result to install new mappings, e.g. use the host mapping * level only to decide whether or not to zap an entry. In this case, it's * not required to hold mmu_lock (though it's highly likely the caller will * want to hold mmu_lock anyways, e.g. to modify SPTEs). * * Note! The lookup can still race with modifications to host page tables, but * the above "rules" ensure KVM will not _consume_ the result of the walk if a * race with the primary MMU occurs. */ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, const struct kvm_memory_slot *slot) { int level = PG_LEVEL_4K; unsigned long hva; unsigned long flags; pgd_t pgd; p4d_t p4d; pud_t pud; pmd_t pmd; /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() * is not solely for performance, it's also necessary to avoid the * "writable" check in __gfn_to_hva_many(), which will always fail on * read-only memslots due to gfn_to_hva() assuming writes. Earlier * page fault steps have already verified the guest isn't writing a * read-only memslot. */ hva = __gfn_to_hva_memslot(slot, gfn); /* * Disable IRQs to prevent concurrent tear down of host page tables, * e.g. if the primary MMU promotes a P*D to a huge page and then frees * the original page table. */ local_irq_save(flags); /* * Read each entry once. As above, a non-leaf entry can be promoted to * a huge page _during_ this walk. Re-reading the entry could send the * walk into the weeks, e.g. p*d_leaf() returns false (sees the old * value) and then p*d_offset() walks into the target huge page instead * of the old page table (sees the new value). */ pgd = READ_ONCE(*pgd_offset(kvm->mm, hva)); if (pgd_none(pgd)) goto out; p4d = READ_ONCE(*p4d_offset(&pgd, hva)); if (p4d_none(p4d) || !p4d_present(p4d)) goto out; pud = READ_ONCE(*pud_offset(&p4d, hva)); if (pud_none(pud) || !pud_present(pud)) goto out; if (pud_leaf(pud)) { level = PG_LEVEL_1G; goto out; } pmd = READ_ONCE(*pmd_offset(&pud, hva)); if (pmd_none(pmd) || !pmd_present(pmd)) goto out; if (pmd_leaf(pmd)) level = PG_LEVEL_2M; out: local_irq_restore(flags); return level; } static int __kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, int max_level, bool is_private) { struct kvm_lpage_info *linfo; int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) break; } if (is_private) return max_level; if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; host_level = host_pfn_mapping_level(kvm, gfn, slot); return min(host_level, max_level); } int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn) { bool is_private = kvm_slot_can_be_private(slot) && kvm_mem_is_private(kvm, gfn); return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); } void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; kvm_pfn_t mask; fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; if (unlikely(fault->max_level == PG_LEVEL_4K)) return; if (is_error_noslot_pfn(fault->pfn)) return; if (kvm_slot_dirty_track_enabled(slot)) return; /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, fault->gfn, fault->max_level, fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; /* * mmu_invalidate_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ fault->goal_level = fault->req_level; mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); fault->pfn &= ~mask; } void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) { if (cur_level > PG_LEVEL_4K && cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte) && spte_to_child_sp(spte)->nx_huge_page_disallowed) { /* * A small SPTE exists for this pfn, but FNAME(fetch), * direct_map(), or kvm_tdp_mmu_map() would like to create a * large PTE instead: just force them to go down another level, * patching back for them into pfn the next 9 bits of the * address. */ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - KVM_PAGES_PER_HPAGE(cur_level - 1); fault->pfn |= fault->gfn & page_mask; fault->goal_level--; } } static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; int ret; gfn_t base_gfn = fault->gfn; kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL); if (sp == ERR_PTR(-EEXIST)) continue; link_shadow_page(vcpu, it.sptep, sp); if (fault->huge_page_disallowed) account_nx_huge_page(vcpu->kvm, sp, fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) return -EFAULT; ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; direct_pte_prefetch(vcpu, it.sptep); return ret; } static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long hva = gfn_to_hva_memslot(slot, gfn); send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current); } static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (is_sigpending_pfn(fault->pfn)) { kvm_handle_signal_exit(vcpu); return -EINTR; } /* * Do not cache the mmio info caused by writing the readonly gfn * into the spte otherwise read access on readonly gfn also can * caused mmio page fault and treat it as mmio access. */ if (fault->pfn == KVM_PFN_ERR_RO_FAULT) return RET_PF_EMULATE; if (fault->pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(fault->slot, fault->gfn); return RET_PF_RETRY; } return -EFAULT; } static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { gva_t gva = fault->is_tdp ? 0 : fault->addr; if (fault->is_private) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; /* * If MMIO caching is disabled, emulate immediately without * touching the shadow page tables as attempting to install an * MMIO SPTE will just be an expensive nop. */ if (unlikely(!enable_mmio_caching)) return RET_PF_EMULATE; /* * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR, * any guest that generates such gfns is running nested and is being * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and * only if L1's MAXPHYADDR is inaccurate with respect to the * hardware's). */ if (unlikely(fault->gfn > kvm_mmu_max_gfn())) return RET_PF_EMULATE; return RET_PF_CONTINUE; } static bool page_fault_can_be_fast(struct kvm *kvm, struct kvm_page_fault *fault) { /* * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only * reach the common page fault handler if the SPTE has an invalid MMIO * generation number. Refreshing the MMIO generation needs to go down * the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag! */ if (fault->rsvd) return false; /* * For hardware-protected VMs, certain conditions like attempting to * perform a write to a page which is not in the state that the guest * expects it to be in can result in a nested/extended #PF. In this * case, the below code might misconstrue this situation as being the * result of a write-protected access, and treat it as a spurious case * rather than taking any action to satisfy the real source of the #PF * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the * guest spinning on a #PF indefinitely, so don't attempt the fast path * in this case. * * Note that the kvm_mem_is_private() check might race with an * attribute update, but this will either result in the guest spinning * on RET_PF_SPURIOUS until the update completes, or an actual spurious * case might go down the slow path. Either case will resolve itself. */ if (kvm->arch.has_private_mem && fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) return false; /* * #PF can be fast if: * * 1. The shadow page table entry is not present and A/D bits are * disabled _by KVM_, which could mean that the fault is potentially * caused by access tracking (if enabled). If A/D bits are enabled * by KVM, but disabled by L1 for L2, KVM is forced to disable A/D * bits for L2 and employ access tracking, but the fast page fault * mechanism only supports direct MMUs. * 2. The shadow page table entry is present, the access is a write, * and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e. * the fault was caused by a write-protection violation. If the * SPTE is MMU-writable (determined later), the fault can be fixed * by setting the Writable bit, which can be done out of mmu_lock. */ if (!fault->present) return !kvm_ad_enabled; /* * Note, instruction fetches and writes are mutually exclusive, ignore * the "exec" flag. */ return fault->write; } /* * Returns true if the SPTE was fixed successfully. Otherwise, * someone else modified the SPTE from its original value. */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, u64 new_spte) { /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in * set_spte. But fast_page_fault is very unlikely to happen with PML * enabled, so we do not do this. This might result in the same GPA * to be logged in PML buffer again when the write really happens, and * eventually to be called by mark_page_dirty twice. But it's also no * harm. This also avoids the TLB flush needed after setting dirty bit * so non-PML cases won't be impacted. * * Compare with make_spte() where instead shadow_dirty_mask is set. */ if (!try_cmpxchg64(sptep, &old_spte, new_spte)) return false; if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); return true; } /* * Returns the last level spte pointer of the shadow page walk for the given * gpa, and sets *spte to the spte value. This spte may be non-preset. If no * walk could be performed, returns NULL and *spte does not contain valid data. * * Contract: * - Must be called between walk_shadow_page_lockless_{begin,end}. * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; u64 old_spte; u64 *sptep = NULL; for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { sptep = iterator.sptep; *spte = old_spte; } return sptep; } /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte; u64 *sptep; uint retry_count = 0; if (!page_fault_can_be_fast(vcpu->kvm, fault)) return ret; walk_shadow_page_lockless_begin(vcpu); do { u64 new_spte; if (tdp_mmu_enabled) sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->gfn, &spte); else sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); /* * It's entirely possible for the mapping to have been zapped * by a different task, but the root page should always be * available as the vCPU holds a reference to its root(s). */ if (WARN_ON_ONCE(!sptep)) spte = FROZEN_SPTE; if (!is_shadow_present_pte(spte)) break; sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; /* * Check whether the memory access that caused the fault would * still cause it if it were to be performed right now. If not, * then this is a spurious fault caused by TLB lazily flushed, * or some other CPU has already fixed the PTE after the * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL. */ if (is_access_allowed(fault, spte)) { ret = RET_PF_SPURIOUS; break; } new_spte = spte; /* * KVM only supports fixing page faults outside of MMU lock for * direct MMUs, nested MMUs are always indirect, and KVM always * uses A/D bits for non-nested MMUs. Thus, if A/D bits are * enabled, the SPTE can't be an access-tracked SPTE. */ if (unlikely(!kvm_ad_enabled) && is_access_track_spte(spte)) new_spte = restore_acc_track_spte(new_spte) | shadow_accessed_mask; /* * To keep things simple, only SPTEs that are MMU-writable can * be made fully writable outside of mmu_lock, e.g. only SPTEs * that were write-protected for dirty-logging or access * tracking are handled here. Don't bother checking if the * SPTE is writable to prioritize running with A/D bits enabled. * The is_access_allowed() check above handles the common case * of the fault being spurious, and the SPTE is known to be * shadow-present, i.e. except for access tracking restoration * making the new SPTE writable, the check is wasteful. */ if (fault->write && is_mmu_writable_spte(spte)) { new_spte |= PT_WRITABLE_MASK; /* * Do not fix write-permission on the large spte when * dirty logging is enabled. Since we only dirty the * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access. */ if (sp->role.level > PG_LEVEL_4K && kvm_slot_dirty_track_enabled(fault->slot)) break; } /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || !is_access_allowed(fault, new_spte)) break; /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } if (++retry_count > 4) { pr_warn_once("Fast #PF retrying more than 4 times.\n"); break; } } while (true); trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); if (ret != RET_PF_INVALID) vcpu->stat.pf_fast++; return ret; } static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, struct list_head *invalid_list) { struct kvm_mmu_page *sp; if (!VALID_PAGE(*root_hpa)) return; sp = root_to_sp(*root_hpa); if (WARN_ON_ONCE(!sp)) return; if (is_tdp_mmu_page(sp)) { lockdep_assert_held_read(&kvm->mmu_lock); kvm_tdp_mmu_put_root(kvm, sp); } else { lockdep_assert_held_write(&kvm->mmu_lock); if (!--sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); } *root_hpa = INVALID_PAGE; } /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free) { bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct; int i; LIST_HEAD(invalid_list); bool free_active_root; WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL); BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. */ free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT) && VALID_PAGE(mmu->root.hpa); if (!free_active_root) { for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && VALID_PAGE(mmu->prev_roots[i].hpa)) break; if (i == KVM_MMU_NUM_PREV_ROOTS) return; } if (is_tdp_mmu) read_lock(&kvm->mmu_lock); else write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, &invalid_list); if (free_active_root) { if (kvm_mmu_is_dummy_root(mmu->root.hpa)) { /* Nothing to cleanup for dummy roots. */ } else if (root_to_sp(mmu->root.hpa)) { mmu_free_root_page(kvm, &mmu->root.hpa, &invalid_list); } else if (mmu->pae_root) { for (i = 0; i < 4; ++i) { if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) continue; mmu_free_root_page(kvm, &mmu->pae_root[i], &invalid_list); mmu->pae_root[i] = INVALID_PAE_ROOT; } } mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; } if (is_tdp_mmu) { read_unlock(&kvm->mmu_lock); WARN_ON_ONCE(!list_empty(&invalid_list)); } else { kvm_mmu_commit_zap_page(kvm, &invalid_list); write_unlock(&kvm->mmu_lock); } } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; struct kvm_mmu_page *sp; hpa_t root_hpa; int i; /* * This should not be called while L2 is active, L2 can't invalidate * _only_ its own roots, e.g. INVVPID unconditionally exits. */ WARN_ON_ONCE(mmu->root_role.guest_mode); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { root_hpa = mmu->prev_roots[i].hpa; if (!VALID_PAGE(root_hpa)) continue; sp = root_to_sp(root_hpa); if (!sp || sp->role.guest_mode) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } kvm_mmu_free_roots(kvm, mmu, roots_to_free); } EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant, u8 level) { union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; struct kvm_mmu_page *sp; role.level = level; role.quadrant = quadrant; WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte); WARN_ON_ONCE(role.direct && role.has_4_byte_gpte); sp = kvm_mmu_get_shadow_page(vcpu, gfn, role); ++sp->root_count; return __pa(sp->spt); } static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u8 shadow_root_level = mmu->root_role.level; hpa_t root; unsigned i; int r; if (tdp_mmu_enabled) { if (kvm_has_mirrored_tdp(vcpu->kvm) && !VALID_PAGE(mmu->mirror_root_hpa)) kvm_tdp_mmu_alloc_root(vcpu, true); kvm_tdp_mmu_alloc_root(vcpu, false); return 0; } write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; if (shadow_root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { if (WARN_ON_ONCE(!mmu->pae_root)) { r = -EIO; goto out_unlock; } for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 0, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | PT_PRESENT_MASK | shadow_me_value; } mmu->root.hpa = __pa(mmu->pae_root); } else { WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); r = -EIO; goto out_unlock; } /* root.pgd is ignored for direct MMUs. */ mmu->root.pgd = 0; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); return r; } static int mmu_first_shadow_root_alloc(struct kvm *kvm) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; int r = 0, i, bkt; /* * Check if this is the first shadow root being allocated before * taking the lock. */ if (kvm_shadow_root_allocated(kvm)) return 0; mutex_lock(&kvm->slots_arch_lock); /* Recheck, under the lock, whether this is the first shadow root. */ if (kvm_shadow_root_allocated(kvm)) goto out_unlock; /* * Check if anything actually needs to be allocated, e.g. all metadata * will be allocated upfront if TDP is disabled. */ if (kvm_memslots_have_rmaps(kvm) && kvm_page_track_write_tracking_enabled(kvm)) goto out_success; for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, bkt, slots) { /* * Both of these functions are no-ops if the target is * already allocated, so unconditionally calling both * is safe. Intentionally do NOT free allocations on * failure to avoid having to track which allocations * were made now versus when the memslot was created. * The metadata is guaranteed to be freed when the slot * is freed, and will be kept/used if userspace retries * KVM_RUN instead of killing the VM. */ r = memslot_rmap_alloc(slot, slot->npages); if (r) goto out_unlock; r = kvm_page_track_write_tracking_alloc(slot); if (r) goto out_unlock; } } /* * Ensure that shadow_root_allocated becomes true strictly after * all the related pointers are set. */ out_success: smp_store_release(&kvm->arch.shadow_root_allocated, true); out_unlock: mutex_unlock(&kvm->slots_arch_lock); return r; } static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 pdptrs[4], pm_mask; gfn_t root_gfn, root_pgd; int quadrant, i, r; hpa_t root; root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu); root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT; if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { mmu->root.hpa = kvm_mmu_get_dummy_root(); return 0; } /* * On SVM, reading PDPTRs might access guest memory, which might fault * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. */ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { pdptrs[i] = mmu->get_pdptr(vcpu, i); if (!(pdptrs[i] & PT_PRESENT_MASK)) continue; if (!kvm_vcpu_is_visible_gfn(vcpu, pdptrs[i] >> PAGE_SHIFT)) pdptrs[i] = 0; } } r = mmu_first_shadow_root_alloc(vcpu->kvm); if (r) return r; write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; /* * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, root_gfn, 0, mmu->root_role.level); mmu->root.hpa = root; goto set_root_pgd; } if (WARN_ON_ONCE(!mmu->pae_root)) { r = -EIO; goto out_unlock; } /* * We shadow a 32 bit page table. This may be a legacy 2-level * or a PAE 3-level page table. In either case we need to be aware that * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_value; if (mmu->root_role.level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; if (mmu->root_role.level == PT64_ROOT_5LEVEL) { if (WARN_ON_ONCE(!mmu->pml5_root)) { r = -EIO; goto out_unlock; } mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; } } for (i = 0; i < 4; ++i) { WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) { if (!(pdptrs[i] & PT_PRESENT_MASK)) { mmu->pae_root[i] = INVALID_PAE_ROOT; continue; } root_gfn = pdptrs[i] >> PAGE_SHIFT; } /* * If shadowing 32-bit non-PAE page tables, each PAE page * directory maps one quarter of the guest's non-PAE page * directory. Othwerise each PAE page direct shadows one guest * PAE page directory so that quadrant should be 0. */ quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0; root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL); mmu->pae_root[i] = root | pm_mask; } if (mmu->root_role.level == PT64_ROOT_5LEVEL) mmu->root.hpa = __pa(mmu->pml5_root); else if (mmu->root_role.level == PT64_ROOT_4LEVEL) mmu->root.hpa = __pa(mmu->pml4_root); else mmu->root.hpa = __pa(mmu->pae_root); set_root_pgd: mmu->root.pgd = root_pgd; out_unlock: write_unlock(&vcpu->kvm->mmu_lock); return r; } static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL; u64 *pml5_root = NULL; u64 *pml4_root = NULL; u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP * tables are allocated and initialized at root creation as there is no * equivalent level in the guest's NPT to shadow. Allocate the tables * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. */ if (mmu->root_role.direct || mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL || mmu->root_role.level < PT64_ROOT_4LEVEL) return 0; /* * NPT, the only paging mode that uses this horror, uses a fixed number * of levels for the shadow page tables, e.g. all MMUs are 4-level or * all MMus are 5-level. Thus, this can safely require that pml5_root * is allocated if the other roots are valid and pml5 is needed, as any * prior MMU would also have required pml5. */ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || (need_pml5 && mmu->pml5_root))) return -EIO; /* * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and * doesn't need to be decrypted. */ pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pae_root) return -ENOMEM; #ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml4_root) goto err_pml4; if (need_pml5) { pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); if (!pml5_root) goto err_pml5; } #endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; mmu->pml5_root = pml5_root; return 0; #ifdef CONFIG_X86_64 err_pml5: free_page((unsigned long)pml4_root); err_pml4: free_page((unsigned long)pae_root); return -ENOMEM; #endif } static bool is_unsync_root(hpa_t root) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root) || kvm_mmu_is_dummy_root(root)) return false; /* * The read barrier orders the CPU's read of SPTE.W during the page table * walk before the reads of sp->unsync/sp->unsync_children here. * * Even if another CPU was marking the SP as unsync-ed simultaneously, * any guest page table changes are not guaranteed to be visible anyway * until this VCPU issues a TLB flush strictly after those changes are * made. We only need to ensure that the other CPU sets these flags * before any actual changes to the page tables are made. The comments * in mmu_try_to_unsync_pages() describe what could go wrong if this * requirement isn't satisfied. */ smp_rmb(); sp = root_to_sp(root); /* * PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the * PDPTEs for a given PAE root need to be synchronized individually. */ if (WARN_ON_ONCE(!sp)) return false; if (sp->unsync || sp->unsync_children) return true; return false; } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; if (vcpu->arch.mmu->root_role.direct) return; if (!VALID_PAGE(vcpu->arch.mmu->root.hpa)) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root.hpa; if (!is_unsync_root(root)) return; sp = root_to_sp(root); write_lock(&vcpu->kvm->mmu_lock); mmu_sync_children(vcpu, sp, true); write_unlock(&vcpu->kvm->mmu_lock); return; } write_lock(&vcpu->kvm->mmu_lock); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu->pae_root[i]; if (IS_VALID_PAE_ROOT(root)) { sp = spte_to_child_sp(root); mmu_sync_children(vcpu, sp, true); } } write_unlock(&vcpu->kvm->mmu_lock); } void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) { unsigned long roots_to_free = 0; int i; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); /* sync prev_roots by simply freeing them */ kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free); } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t vaddr, u64 access, struct x86_exception *exception) { if (exception) exception->error_code = 0; return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); } static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { /* * A nested guest cannot use the MMIO cache if it is using nested * page tables, because cr2 is a nGPA while the cache stores GPAs. */ if (mmu_is_nested(vcpu)) return false; if (direct) return vcpu_match_mmio_gpa(vcpu, addr); return vcpu_match_mmio_gva(vcpu, addr); } /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. * * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { struct kvm_shadow_walk_iterator iterator; int leaf = -1; u64 spte; for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); __shadow_walk_next(&iterator, spte)) { leaf = iterator.level; spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf] = spte; } return leaf; } static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { int leaf; walk_shadow_page_lockless_begin(vcpu); if (is_tdp_mmu_active(vcpu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level); else leaf = get_walk(vcpu, addr, sptes, root_level); walk_shadow_page_lockless_end(vcpu); return leaf; } /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; struct rsvd_bits_validate *rsvd_check; int root, leaf, level; bool reserved = false; leaf = get_sptes_lockless(vcpu, addr, sptes, &root); if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; } *sptep = sptes[leaf]; /* * Skip reserved bits checks on the terminal leaf if it's not a valid * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by * design, always have reserved bits set. The purpose of the checks is * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. */ if (!is_shadow_present_pte(sptes[leaf])) leaf++; rsvd_check = &vcpu->arch.mmu->shadow_zero_check; for (level = root; level >= leaf; level--) reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); if (reserved) { pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", __func__, addr); for (level = root; level >= leaf; level--) pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", sptes[level], level, get_rsvd_bits(rsvd_check, sptes[level], level)); } return reserved; } static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; if (mmio_info_in_cache(vcpu, addr, direct)) return RET_PF_EMULATE; reserved = get_mmio_spte(vcpu, addr, &spte); if (WARN_ON_ONCE(reserved)) return -EINVAL; if (is_mmio_spte(vcpu->kvm, spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned int access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) return RET_PF_INVALID; if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); return RET_PF_EMULATE; } /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ return RET_PF_RETRY; } static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { if (unlikely(fault->rsvd)) return false; if (!fault->present || !fault->write) return false; /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ if (kvm_gfn_is_write_tracked(vcpu->kvm, fault->slot, fault->gfn)) return true; return false; } static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) { struct kvm_shadow_walk_iterator iterator; u64 spte; walk_shadow_page_lockless_begin(vcpu); for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) clear_sp_write_flooding_count(iterator.sptep); walk_shadow_page_lockless_end(vcpu); } static u32 alloc_apf_token(struct kvm_vcpu *vcpu) { /* make sure the token value is not 0 */ u32 id = vcpu->arch.apf.id; if (id << 12 == 0) vcpu->arch.apf.id = 1; return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; } static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_arch_async_pf arch; arch.token = alloc_apf_token(vcpu); arch.gfn = fault->gfn; arch.error_code = fault->error_code; arch.direct_map = vcpu->arch.mmu->root_role.direct; arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu); return kvm_setup_async_pf(vcpu, fault->addr, kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch); } void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { int r; if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS)) return; if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) || work->wakeup_all) return; r = kvm_mmu_reload(vcpu); if (unlikely(r)) return; if (!vcpu->arch.mmu->root_role.direct && work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu)) return; r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL, NULL); /* * Account fixed page faults, otherwise they'll never be counted, but * ignore stats for all other return times. Page-ready "faults" aren't * truly spurious and never trigger emulation */ if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; } static inline u8 kvm_max_level_for_order(int order) { BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) return PG_LEVEL_1G; if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) return PG_LEVEL_2M; return PG_LEVEL_4K; } static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, u8 max_level, int gmem_order) { u8 req_max_level; if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; max_level = min(kvm_max_level_for_order(gmem_order), max_level); if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; req_max_level = kvm_x86_call(private_max_mapping_level)(kvm, pfn); if (req_max_level) max_level = min(max_level, req_max_level); return max_level; } static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, r == RET_PF_RETRY, fault->map_writable); } static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int max_order, r; if (!kvm_slot_can_be_private(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; } fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn, fault->max_level, max_order); return RET_PF_CONTINUE; } static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { unsigned int foll = fault->write ? FOLL_WRITE : 0; if (fault->is_private) return kvm_mmu_faultin_pfn_private(vcpu, fault); foll |= FOLL_NOWAIT; fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, &fault->map_writable, &fault->refcounted_page); /* * If resolving the page failed because I/O is needed to fault-in the * page, then either set up an asynchronous #PF to do the I/O, or if * doing an async #PF isn't possible, retry with I/O allowed. All * other failures are terminal, i.e. retrying won't help. */ if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) return RET_PF_CONTINUE; if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); return RET_PF_RETRY; } else if (kvm_arch_setup_async_pf(vcpu, fault)) { return RET_PF_RETRY; } } /* * Allow gup to bail on pending non-fatal signals when it's also allowed * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ foll |= FOLL_INTERRUPTIBLE; foll &= ~FOLL_NOWAIT; fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn, foll, &fault->map_writable, &fault->refcounted_page); return RET_PF_CONTINUE; } static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; struct kvm *kvm = vcpu->kvm; int ret; if (KVM_BUG_ON(kvm_is_gfn_alias(kvm, fault->gfn), kvm)) return -EFAULT; /* * Note that the mmu_invalidate_seq also serves to detect a concurrent * change in attributes. is_page_fault_stale() will detect an * invalidation relate to fault->fn and resume the guest without * installing a mapping in the page tables. */ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); /* * Now that we have a snapshot of mmu_invalidate_seq we can check for a * private vs. shared mismatch. */ if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } if (unlikely(!slot)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * Retry the page fault if the gfn hit a memslot that is being deleted * or moved. This ensures any existing SPTEs for the old memslot will * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot->flags & KVM_MEMSLOT_INVALID) return RET_PF_RETRY; if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) { /* * Don't map L1's APIC access page into L2, KVM doesn't support * using APICv/AVIC to accelerate L2 accesses to L1's APIC, * i.e. the access needs to be emulated. Emulating access to * L1's APIC is also correct if L1 is accelerating L2's own * virtual APIC, but for some reason L1 also maps _L1's_ APIC * into L2. Note, vcpu_is_mmio_gpa() always treats access to * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM * uses different roots for L1 vs. L2, i.e. there is no danger * of breaking APICv/AVIC for L1. */ if (is_guest_mode(vcpu)) return kvm_handle_noslot_fault(vcpu, fault, access); /* * If the APIC access page exists but is disabled, go directly * to emulation without caching the MMIO access or creating a * MMIO SPTE. That way the cache doesn't need to be purged * when the AVIC is re-enabled. */ if (!kvm_apicv_activated(vcpu->kvm)) return RET_PF_EMULATE; } /* * Check for a relevant mmu_notifier invalidation event before getting * the pfn from the primary MMU, and before acquiring mmu_lock. * * For mmu_lock, if there is an in-progress invalidation and the kernel * allows preemption, the invalidation task may drop mmu_lock and yield * in response to mmu_lock being contended, which is *very* counter- * productive as this vCPU can't actually make forward progress until * the invalidation completes. * * Retrying now can also avoid unnessary lock contention in the primary * MMU, as the primary MMU doesn't necessarily hold a single lock for * the duration of the invalidation, i.e. faulting in a conflicting pfn * can cause the invalidation to take longer by holding locks that are * needed to complete the invalidation. * * Do the pre-check even for non-preemtible kernels, i.e. even if KVM * will never yield mmu_lock in response to contention, as this vCPU is * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held * to detect retry guarantees the worst case latency for the vCPU. */ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; ret = __kvm_mmu_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; if (unlikely(is_error_pfn(fault->pfn))) return kvm_handle_error_pfn(vcpu, fault); if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); /* * Check again for a relevant mmu_notifier invalidation event purely to * avoid contending mmu_lock. Most invalidations will be detected by * the previous check, but checking is extremely cheap relative to the * overall cost of failing to detect the invalidation until after * mmu_lock is acquired. */ if (mmu_invalidate_retry_gfn_unsafe(kvm, fault->mmu_seq, fault->gfn)) { kvm_mmu_finish_page_fault(vcpu, fault, RET_PF_RETRY); return RET_PF_RETRY; } return RET_PF_CONTINUE; } /* * Returns true if the page fault is stale and needs to be retried, i.e. if the * root was invalidated by a memslot update or a relevant mmu_notifier fired. */ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); /* Special roots, e.g. pae_root, are not backed by shadow pages. */ if (sp && is_obsolete_sp(vcpu->kvm, sp)) return true; /* * Roots without an associated shadow page are considered invalid if * there is a pending request to free obsolete roots. The request is * only a hint that the current root _may_ be obsolete and needs to be * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs * to reload even if no vCPU is actively using the root. */ if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; /* * Check for a relevant mmu_notifier invalidation event one last time * now that mmu_lock is held, as the "unsafe" checks performed without * holding mmu_lock can get false negatives. */ return fault->slot && mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int r; /* Dummy roots are used only for shadowing bad guest roots. */ if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; r = mmu_topup_memory_caches(vcpu, false); if (r) return r; r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; r = direct_map(vcpu, fault); out_unlock: kvm_mmu_finish_page_fault(vcpu, fault, r); write_unlock(&vcpu->kvm->mmu_lock); return r; } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ fault->max_level = PG_LEVEL_2M; return direct_page_fault(vcpu, fault); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ if (WARN_ON_ONCE(fault_address >> 32)) return -EFAULT; #endif /* * Legacy #PF exception only have a 32-bit error code. Simply drop the * upper bits as KVM doesn't use them for #PF (because they are never * set), and to ensure there are no collisions with KVM-defined bits. */ if (WARN_ON_ONCE(error_code >> 32)) error_code = lower_32_bits(error_code); /* * Restrict KVM-defined flags to bits 63:32 so that it's impossible for * them to conflict with #PF error codes, which are limited to 32 bits. */ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK)); vcpu->arch.l1tf_flush_l1d = true; if (!flags) { trace_kvm_page_fault(vcpu, fault_address, error_code); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); } else { WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); #ifdef CONFIG_X86_64 static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { int r; if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_WRITE_PROTECTED; r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; r = mmu_topup_memory_caches(vcpu, false); if (r) return r; r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; r = RET_PF_RETRY; read_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = kvm_tdp_mmu_map(vcpu, fault); out_unlock: kvm_mmu_finish_page_fault(vcpu, fault, r); read_unlock(&vcpu->kvm->mmu_lock); return r; } #endif bool kvm_mmu_may_ignore_guest_pat(void) { /* * When EPT is enabled (shadow_memtype_mask is non-zero), and the VM * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to * honor the memtype from the guest's PAT so that guest accesses to * memory that is DMA'd aren't cached against the guest's wishes. As a * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA, * KVM _always_ ignores guest PAT (when EPT is enabled). */ return shadow_memtype_mask; } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { #ifdef CONFIG_X86_64 if (tdp_mmu_enabled) return kvm_tdp_mmu_page_fault(vcpu, fault); #endif return direct_page_fault(vcpu, fault); } static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code, u8 *level) { int r; /* * Restrict to TDP page fault, since that's the only case where the MMU * is indexed by GPA. */ if (vcpu->arch.mmu->page_fault != kvm_tdp_page_fault) return -EOPNOTSUPP; do { if (signal_pending(current)) return -EINTR; cond_resched(); r = kvm_mmu_do_page_fault(vcpu, gpa, error_code, true, NULL, level); } while (r == RET_PF_RETRY); if (r < 0) return r; switch (r) { case RET_PF_FIXED: case RET_PF_SPURIOUS: case RET_PF_WRITE_PROTECTED: return 0; case RET_PF_EMULATE: return -ENOENT; case RET_PF_RETRY: case RET_PF_CONTINUE: case RET_PF_INVALID: default: WARN_ONCE(1, "could not fix page fault during prefault"); return -EIO; } } long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range) { u64 error_code = PFERR_GUEST_FINAL_MASK; u8 level = PG_LEVEL_4K; u64 end; int r; if (!vcpu->kvm->arch.pre_fault_allowed) return -EOPNOTSUPP; /* * reload is efficient when called repeatedly, so we can do it on * every iteration. */ r = kvm_mmu_reload(vcpu); if (r) return r; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) error_code |= PFERR_PRIVATE_ACCESS; /* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. */ r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); if (r < 0) return r; /* * If the mapping that covers range->gpa can use a huge page, it * may start below it or end after range->gpa + range->size. */ end = (range->gpa & KVM_HPAGE_MASK(level)) + KVM_HPAGE_SIZE(level); return min(range->size, end - range->gpa); } static void nonpaging_init_context(struct kvm_mmu *context) { context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_spte = NULL; } static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root->hpa)) return false; if (!role.direct && pgd != root->pgd) return false; sp = root_to_sp(root->hpa); if (WARN_ON_ONCE(!sp)) return false; return role.word == sp->role.word; } /* * Find out if a previously cached root matching the new pgd/role is available, * and insert the current root as the MRU in the cache. * If a matching root is found, it is assigned to kvm_mmu->root and * true is returned. * If no match is found, kvm_mmu->root is left invalid, the LRU root is * evicted to make room for the current root, and false is returned. */ static bool cached_root_find_and_keep_current(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; if (is_root_usable(&mmu->root, new_pgd, new_role)) return true; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { /* * The swaps end up rotating the cache like this: * C 0 1 2 3 (on entry to the function) * 0 C 1 2 3 * 1 C 0 2 3 * 2 C 0 1 3 * 3 C 0 1 2 (on exit from the loop) */ swap(mmu->root, mmu->prev_roots[i]); if (is_root_usable(&mmu->root, new_pgd, new_role)) return true; } kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); return false; } /* * Find out if a previously cached root matching the new pgd/role is available. * On entry, mmu->root is invalid. * If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry * of the cache becomes invalid, and true is returned. * If no match is found, kvm_mmu->root is left invalid and false is returned. */ static bool cached_root_find_without_current(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (is_root_usable(&mmu->prev_roots[i], new_pgd, new_role)) goto hit; return false; hit: swap(mmu->root, mmu->prev_roots[i]); /* Bubble up the remaining roots. */ for (; i < KVM_MMU_NUM_PREV_ROOTS - 1; i++) mmu->prev_roots[i] = mmu->prev_roots[i + 1]; mmu->prev_roots[i].hpa = INVALID_PAGE; return true; } static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { /* * Limit reuse to 64-bit hosts+VMs without "special" roots in order to * avoid having to deal with PDPTEs and other complexities. */ if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(mmu->root.hpa)) kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT); if (VALID_PAGE(mmu->root.hpa)) return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role); else return cached_root_find_without_current(kvm, mmu, new_pgd, new_role); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) { struct kvm_mmu *mmu = vcpu->arch.mmu; union kvm_mmu_page_role new_role = mmu->root_role; /* * Return immediately if no usable root was found, kvm_mmu_reload() * will establish a valid root prior to the next VM-Enter. */ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) return; /* * It's possible that the cached previous root page is obsolete because * of a change in the MMU generation number. However, changing the * generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, * which will free the root set here and allocate a new one. */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); if (force_flush_and_sync_on_reuse) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } /* * The last MMIO access's GVA and GPA are cached in the VCPU. When * switching to a new CR3, that GVA->GPA mapping may no longer be * valid. So clear any cached MMIO info even when we don't need to sync * the shadow page tables. */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); /* * If this is a direct root page, it doesn't have a write flooding * count. Otherwise, clear the write flooding count. */ if (!new_role.direct) { struct kvm_mmu_page *sp = root_to_sp(vcpu->arch.mmu->root.hpa); if (!WARN_ON_ONCE(!sp)) __clear_sp_write_flooding_count(sp); } } EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned int access) { if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { mmu_spte_clear_no_track(sptep); return true; } mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } #define PTTYPE_EPT 18 /* arbitrary */ #define PTTYPE PTTYPE_EPT #include "paging_tmpl.h" #undef PTTYPE #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE #define PTTYPE 32 #include "paging_tmpl.h" #undef PTTYPE static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, int level, bool nx, bool gbpages, bool pse, bool amd) { u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; u64 high_bits_rsvd; rsvd_check->bad_mt_xwr = 0; if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); if (level == PT32E_ROOT_LEVEL) high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); else high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); /* Note, NX doesn't exist in PDPTEs, this is handled below. */ if (!nx) high_bits_rsvd |= rsvd_bits(63, 63); /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. */ if (amd) nonleaf_bit8_rsvd = rsvd_bits(8, 8); switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ rsvd_check->rsvd_bits_mask[0][1] = 0; rsvd_check->rsvd_bits_mask[0][0] = 0; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; if (!pse) { rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | high_bits_rsvd | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_5LEVEL: rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; fallthrough; case PT64_ROOT_4LEVEL: rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | nonleaf_bit8_rsvd | rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | gbpages_bit_rsvd; rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | gbpages_bit_rsvd | rsvd_bits(13, 29); rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(13, 20); /* large page */ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; break; } } static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { __reset_rsvds_bits_mask(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, context->cpu_role.base.level, is_efer_nx(context), guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), is_cr4_pse(context), guest_cpuid_is_amd_compatible(vcpu)); } static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, u64 pa_bits_rsvd, bool execonly, int huge_page_level) { u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); u64 large_1g_rsvd = 0, large_2m_rsvd = 0; u64 bad_mt_xwr; if (huge_page_level < PG_LEVEL_1G) large_1g_rsvd = rsvd_bits(7, 7); if (huge_page_level < PG_LEVEL_2M) large_2m_rsvd = rsvd_bits(7, 7); rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* large page */ rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ if (!execonly) { /* bits 0..2 must not be 100 unless VMX capabilities allow it */ bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, struct kvm_mmu *context, bool execonly, int huge_page_level) { __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, vcpu->arch.reserved_gpa_bits, execonly, huge_page_level); } static inline u64 reserved_hpa_bits(void) { return rsvd_bits(kvm_host.maxphyaddr, 63); } /* * the page table on host is the shadow page table for the page * table in guest or amd nested guest, its mmu features completely * follow the features in guest. */ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { /* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */ bool is_amd = true; /* KVM doesn't use 2-level page tables for the shadow MMU. */ bool is_pse = false; struct rsvd_bits_validate *shadow_zero_check; int i; WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL); shadow_zero_check = &context->shadow_zero_check; __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, context->root_role.efer_nx, guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES), is_pse, is_amd); if (!shadow_me_mask) return; for (i = context->root_role.level; --i >= 0;) { /* * So far shadow_me_value is a constant during KVM's life * time. Bits in shadow_me_value are allowed to be set. * Bits in shadow_me_mask but not in shadow_me_value are * not allowed to be set. */ shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask; shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask; shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value; shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value; } } static inline bool boot_cpu_is_amd(void) { WARN_ON_ONCE(!tdp_enabled); return shadow_x_mask == 0; } /* * the direct page table on host, use as much mmu features as * possible, however, kvm currently does not do execution-protection. */ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context) { struct rsvd_bits_validate *shadow_zero_check; int i; shadow_zero_check = &context->shadow_zero_check; if (boot_cpu_is_amd()) __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), context->root_role.level, true, boot_cpu_has(X86_FEATURE_GBPAGES), false, true); else __reset_rsvds_bits_mask_ept(shadow_zero_check, reserved_hpa_bits(), false, max_huge_page_level); if (!shadow_me_mask) return; for (i = context->root_role.level; --i >= 0;) { shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; } } /* * as the comments in reset_shadow_zero_bits_mask() except it * is the shadow page table for intel nested guest. */ static void reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly) { __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, reserved_hpa_bits(), execonly, max_huge_page_level); } #define BYTE_MASK(access) \ ((1 & (access) ? 2 : 0) | \ (2 & (access) ? 4 : 0) | \ (3 & (access) ? 8 : 0) | \ (4 & (access) ? 16 : 0) | \ (5 & (access) ? 32 : 0) | \ (6 & (access) ? 64 : 0) | \ (7 & (access) ? 128 : 0)) static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) { unsigned byte; const u8 x = BYTE_MASK(ACC_EXEC_MASK); const u8 w = BYTE_MASK(ACC_WRITE_MASK); const u8 u = BYTE_MASK(ACC_USER_MASK); bool cr4_smep = is_cr4_smep(mmu); bool cr4_smap = is_cr4_smap(mmu); bool cr0_wp = is_cr0_wp(mmu); bool efer_nx = is_efer_nx(mmu); for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { unsigned pfec = byte << 1; /* * Each "*f" variable has a 1 bit for each UWX value * that causes a fault with the given PFEC. */ /* Faults from writes to non-writable pages */ u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; /* Faults from user mode accesses to supervisor pages */ u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; /* Faults from fetches of non-executable pages*/ u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; /* Faults from kernel mode fetches of user pages */ u8 smepf = 0; /* Faults from kernel mode accesses of user pages */ u8 smapf = 0; if (!ept) { /* Faults from kernel mode accesses to user pages */ u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; /* Not really needed: !nx will cause pte.nx to fault */ if (!efer_nx) ff = 0; /* Allow supervisor writes if !cr0.wp */ if (!cr0_wp) wf = (pfec & PFERR_USER_MASK) ? wf : 0; /* Disallow supervisor fetches of user code if cr4.smep */ if (cr4_smep) smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; /* * SMAP:kernel-mode data accesses from user-mode * mappings should fault. A fault is considered * as a SMAP violation if all of the following * conditions are true: * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch * - The access is supervisor mode * - If implicit supervisor access or X86_EFLAGS_AC is clear * * Here, we cover the first four conditions. * The fifth is computed dynamically in permission_fault(); * PFERR_RSVD_MASK bit will be set in PFEC if the access is * *not* subject to SMAP restrictions. */ if (cr4_smap) smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; } mmu->permissions[byte] = ff | uf | wf | smepf | smapf; } } /* * PKU is an additional mechanism by which the paging controls access to * user-mode addresses based on the value in the PKRU register. Protection * key violations are reported through a bit in the page fault error code. * Unlike other bits of the error code, the PK bit is not known at the * call site of e.g. gva_to_gpa; it must be computed directly in * permission_fault based on two bits of PKRU, on some machine state (CR4, * CR0, EFER, CPL), and on other bits of the error code and the page tables. * * In particular the following conditions come from the error code, the * page tables and the machine state: * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) * - PK is always zero if U=0 in the page tables * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. * * The PKRU bitmask caches the result of these four conditions. The error * code (minus the P bit) and the page table's U bit form an index into the * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed * with the two bits of the PKRU register corresponding to the protection key. * For the first three conditions above the bits will be 00, thus masking * away both AD and WD. For all reads or if the last condition holds, WD * only will be masked away. */ static void update_pkru_bitmask(struct kvm_mmu *mmu) { unsigned bit; bool wp; mmu->pkru_mask = 0; if (!is_cr4_pke(mmu)) return; wp = is_cr0_wp(mmu); for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { unsigned pfec, pkey_bits; bool check_pkey, check_write, ff, uf, wf, pte_user; pfec = bit << 1; ff = pfec & PFERR_FETCH_MASK; uf = pfec & PFERR_USER_MASK; wf = pfec & PFERR_WRITE_MASK; /* PFEC.RSVD is replaced by ACC_USER_MASK. */ pte_user = pfec & PFERR_RSVD_MASK; /* * Only need to check the access which is not an * instruction fetch and is to a user page. */ check_pkey = (!ff && pte_user); /* * write access is controlled by PKRU if it is a * user access or CR0.WP = 1. */ check_write = check_pkey && wf && (uf || wp); /* PKRU.AD stops both read and write access. */ pkey_bits = !!check_pkey; /* PKRU.WD stops write access. */ pkey_bits |= (!!check_write) << 1; mmu->pkru_mask |= (pkey_bits & 3) << pfec; } } static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { if (!is_cr0_pg(mmu)) return; reset_guest_rsvds_bits_mask(vcpu, mmu); update_permission_bitmask(mmu, false); update_pkru_bitmask(mmu); } static void paging64_init_context(struct kvm_mmu *context) { context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->sync_spte = paging64_sync_spte; } static void paging32_init_context(struct kvm_mmu *context) { context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; context->sync_spte = paging32_sync_spte; } static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs) { union kvm_cpu_role role = {0}; role.base.access = ACC_ALL; role.base.smm = is_smm(vcpu); role.base.guest_mode = is_guest_mode(vcpu); role.ext.valid = 1; if (!____is_cr0_pg(regs)) { role.base.direct = 1; return role; } role.base.efer_nx = ____is_efer_nx(regs); role.base.cr0_wp = ____is_cr0_wp(regs); role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs); role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs); role.base.has_4_byte_gpte = !____is_cr4_pae(regs); if (____is_efer_lma(regs)) role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; else if (____is_cr4_pae(regs)) role.base.level = PT32E_ROOT_LEVEL; else role.base.level = PT32_ROOT_LEVEL; role.ext.cr4_smep = ____is_cr4_smep(regs); role.ext.cr4_smap = ____is_cr4_smap(regs); role.ext.cr4_pse = ____is_cr4_pse(regs); /* PKEY and LA57 are active iff long mode is active. */ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); role.ext.efer_lma = ____is_efer_lma(regs); return role; } void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP); BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP); BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS)); if (is_cr0_wp(mmu) == cr0_wp) return; mmu->cpu_role.base.cr0_wp = cr0_wp; reset_guest_paging_metadata(vcpu, mmu); } static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { /* tdp_root_level is architecture forced level, use it if nonzero */ if (tdp_root_level) return tdp_root_level; /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; return max_tdp_level; } u8 kvm_mmu_get_max_tdp_level(void) { return tdp_root_level ? tdp_root_level : max_tdp_level; } static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { union kvm_mmu_page_role role = {0}; role.access = ACC_ALL; role.cr0_wp = true; role.efer_nx = true; role.smm = cpu_role.base.smm; role.guest_mode = cpu_role.base.guest_mode; role.ad_disabled = !kvm_ad_enabled; role.level = kvm_mmu_get_tdp_level(vcpu); role.direct = true; role.has_4_byte_gpte = false; return role; } static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role); if (cpu_role.as_u64 == context->cpu_role.as_u64 && root_role.word == context->root_role.word) return; context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; context->page_fault = kvm_tdp_page_fault; context->sync_spte = NULL; context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; if (!is_cr0_pg(context)) context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_cr4_pae(context)) context->gva_to_gpa = paging64_gva_to_gpa; else context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, context); reset_tdp_shadow_zero_bits_mask(context); } static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, union kvm_cpu_role cpu_role, union kvm_mmu_page_role root_role) { if (cpu_role.as_u64 == context->cpu_role.as_u64 && root_role.word == context->root_role.word) return; context->cpu_role.as_u64 = cpu_role.as_u64; context->root_role.word = root_role.word; if (!is_cr0_pg(context)) nonpaging_init_context(context); else if (is_cr4_pae(context)) paging64_init_context(context); else paging32_init_context(context); reset_guest_paging_metadata(vcpu, context); reset_shadow_zero_bits_mask(vcpu, context); } static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_page_role root_role; root_role = cpu_role.base; /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL); /* * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role. * KVM uses NX when TDP is disabled to handle a variety of scenarios, * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. * The iTLB multi-hit workaround can be toggled at any time, so assume * NX can be used by any non-nested shadow MMU to avoid having to reset * MMU contexts. */ root_role.efer_nx = true; shadow_mmu_init_context(vcpu, context, cpu_role, root_role); } void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, unsigned long cr4, u64 efer, gpa_t nested_cr3) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { .cr0 = cr0, .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs); union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); if (root_role.level == PT64_ROOT_5LEVEL && cpu_role.base.level == PT64_ROOT_4LEVEL) root_role.passthrough = 1; shadow_mmu_init_context(vcpu, context, cpu_role, root_role); kvm_mmu_new_pgd(vcpu, nested_cr3); } EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_cpu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, bool execonly, u8 level) { union kvm_cpu_role role = {0}; /* * KVM does not support SMM transfer monitors, and consequently does not * support the "entry to SMM" control either. role.base.smm is always 0. */ WARN_ON_ONCE(is_smm(vcpu)); role.base.level = level; role.base.has_4_byte_gpte = false; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; role.base.guest_mode = true; role.base.access = ACC_ALL; role.ext.word = 0; role.ext.execonly = execonly; role.ext.valid = 1; return role; } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, int huge_page_level, bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_cpu_role new_mode = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); if (new_mode.as_u64 != context->cpu_role.as_u64) { /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */ context->cpu_role.as_u64 = new_mode.as_u64; context->root_role.word = new_mode.base.word; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_spte = ept_sync_spte; update_permission_bitmask(context, true); context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); reset_ept_shadow_zero_bits_mask(context, execonly); } kvm_mmu_new_pgd(vcpu, new_eptp); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) { struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, cpu_role); context->get_guest_pgd = get_guest_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu, union kvm_cpu_role new_mode) { struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; if (new_mode.as_u64 == g_context->cpu_role.as_u64) return; g_context->cpu_role.as_u64 = new_mode.as_u64; g_context->get_guest_pgd = get_guest_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* * L2 page tables are never shadowed, so there is no need to sync * SPTEs. */ g_context->sync_spte = NULL; /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using * L2's page tables as the first level of translation and L1's * nested page tables as the second level of translation. Basically * the gva_to_gpa functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) g_context->gva_to_gpa = nonpaging_gva_to_gpa; else if (is_long_mode(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa; else if (is_pae(vcpu)) g_context->gva_to_gpa = paging64_gva_to_gpa; else g_context->gva_to_gpa = paging32_gva_to_gpa; reset_guest_paging_metadata(vcpu, g_context); } void kvm_init_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs); if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu, cpu_role); else if (tdp_enabled) init_kvm_tdp_mmu(vcpu, cpu_role); else init_kvm_softmmu(vcpu, cpu_role); } EXPORT_SYMBOL_GPL(kvm_init_mmu); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) { /* * Invalidate all MMU roles to force them to reinitialize as CPUID * information is factored into reserved bit calculations. * * Correctly handling multiple vCPU models with respect to paging and * physical address properties) in a single VM would require tracking * all relevant CPUID information in kvm_mmu_page_role. That is very * undesirable as it would increase the memory requirements for * gfn_write_track (see struct kvm_mmu_page_role comments). For now * that problem is swept under the rug; KVM's CPUID API is horrific and * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.root_role.invalid = 1; vcpu->arch.guest_mmu.root_role.invalid = 1; vcpu->arch.nested_mmu.root_role.invalid = 1; vcpu->arch.root_mmu.cpu_role.ext.valid = 0; vcpu->arch.guest_mmu.cpu_role.ext.valid = 0; vcpu->arch.nested_mmu.cpu_role.ext.valid = 0; kvm_mmu_reset_context(vcpu); /* * Changing guest CPUID after KVM_RUN is forbidden, see the comment in * kvm_arch_vcpu_ioctl(). */ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); kvm_init_mmu(vcpu); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct); if (r) goto out; r = mmu_alloc_special_roots(vcpu); if (r) goto out; if (vcpu->arch.mmu->root_role.direct) r = mmu_alloc_direct_roots(vcpu); else r = mmu_alloc_shadow_roots(vcpu); if (r) goto out; kvm_mmu_sync_roots(vcpu); kvm_mmu_load_pgd(vcpu); /* * Flush any TLB entries for the new root, the provenance of the root * is unknown. Even if KVM ensures there are no stale TLB entries * for a freed root, in theory another hypervisor could have left * stale entries. Flushing on alloc also allows KVM to skip the TLB * flush when freeing a root (see kvm_tdp_mmu_put_root()). */ kvm_x86_call(flush_tlb_current)(vcpu); out: return r; } void kvm_mmu_unload(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa)); kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa)); vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); } static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa) { struct kvm_mmu_page *sp; if (!VALID_PAGE(root_hpa)) return false; /* * When freeing obsolete roots, treat roots as obsolete if they don't * have an associated shadow page, as it's impossible to determine if * such roots are fresh or stale. This does mean KVM will get false * positives and free roots that don't strictly need to be freed, but * such false positives are relatively rare: * * (a) only PAE paging and nested NPT have roots without shadow pages * (or any shadow paging flavor with a dummy root, see note below) * (b) remote reloads due to a memslot update obsoletes _all_ roots * (c) KVM doesn't track previous roots for PAE paging, and the guest * is unlikely to zap an in-use PGD. * * Note! Dummy roots are unique in that they are obsoleted by memslot * _creation_! See also FNAME(fetch). */ sp = root_to_sp(root_hpa); return !sp || is_obsolete_sp(kvm, sp); } static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu) { unsigned long roots_to_free = 0; int i; if (is_obsolete_root(kvm, mmu->root.hpa)) roots_to_free |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots_to_free) kvm_mmu_free_roots(kvm, mmu, roots_to_free); } void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) { __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) { u64 gentry = 0; int r; /* * Assume that the pte write on a page table of the same type * as the current vcpu paging mode since we update the sptes only * when they have the same mode. */ if (is_pae(vcpu) && *bytes == 4) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; } if (*bytes == 4 || *bytes == 8) { r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); if (r) gentry = 0; } return gentry; } /* * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. */ static bool detect_write_flooding(struct kvm_mmu_page *sp) { /* * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ if (sp->role.level == PG_LEVEL_4K) return false; atomic_inc(&sp->write_flooding_count); return atomic_read(&sp->write_flooding_count) >= 3; } /* * Misaligned accesses are too much trouble to fix up; also, they usually * indicate a page is not used as a page table. */ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, int bytes) { unsigned offset, pte_size, misaligned; offset = offset_in_page(gpa); pte_size = sp->role.has_4_byte_gpte ? 4 : 8; /* * Sometimes, the OS only writes the last one bytes to update status * bits, for example, in linux, andb instruction is used in clear_bit(). */ if (!(offset & (pte_size - 1)) && bytes == 1) return false; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; return misaligned; } static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) { unsigned page_offset, quadrant; u64 *spte; int level; page_offset = offset_in_page(gpa); level = sp->role.level; *nspte = 1; if (sp->role.has_4_byte_gpte) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map * only 2MB. So we need to double the offset again * and zap two pdes instead of one. */ if (level == PT32_ROOT_LEVEL) { page_offset &= ~7; /* kill rounding error */ page_offset <<= 1; *nspte = 2; } quadrant = page_offset >> PAGE_SHIFT; page_offset &= ~PAGE_MASK; if (quadrant != sp->role.quadrant) return NULL; } spte = &sp->spt[page_offset / sizeof(*spte)]; return spte; } void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; bool flush = false; /* * When emulating guest writes, ensure the written value is visible to * any task that is handling page faults before checking whether or not * KVM is shadowing a guest PTE. This ensures either KVM will create * the correct SPTE in the page fault handler, or this task will see * a non-zero indirect_shadow_pages. Pairs with the smp_mb() in * account_shadowed(). */ smp_mb(); if (!vcpu->kvm->arch.indirect_shadow_pages) return; write_lock(&vcpu->kvm->mmu_lock); gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); ++vcpu->kvm->stat.mmu_pte_write; for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } spte = get_written_sptes(sp, gpa, &npte); if (!spte) continue; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; if (is_shadow_present_pte(entry)) flush = true; ++spte; } } kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); write_unlock(&vcpu->kvm->mmu_lock); } static bool is_write_to_guest_page_table(u64 error_code) { const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK; return (error_code & mask) == mask; } static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, int *emulation_type) { bool direct = vcpu->arch.mmu->root_role.direct; /* * Do not try to unprotect and retry if the vCPU re-faulted on the same * RIP with the same address that was previously unprotected, as doing * so will likely put the vCPU into an infinite. E.g. if the vCPU uses * a non-page-table modifying instruction on the PDE that points to the * instruction, then unprotecting the gfn will unmap the instruction's * code, i.e. make it impossible for the instruction to ever complete. */ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) && vcpu->arch.last_retry_addr == cr2_or_gpa) return RET_PF_EMULATE; /* * Reset the unprotect+retry values that guard against infinite loops. * The values will be refreshed if KVM explicitly unprotects a gfn and * retries, in all other cases it's safe to retry in the future even if * the next page fault happens on the same RIP+address. */ vcpu->arch.last_retry_eip = 0; vcpu->arch.last_retry_addr = 0; /* * It should be impossible to reach this point with an MMIO cache hit, * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid, * writable memslot, and creating a memslot should invalidate the MMIO * cache by way of changing the memslot generation. WARN and disallow * retry if MMIO is detected, as retrying MMIO emulation is pointless * and could put the vCPU into an infinite loop because the processor * will keep faulting on the non-existent MMIO address. */ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct))) return RET_PF_EMULATE; /* * Before emulating the instruction, check to see if the access was due * to a read-only violation while the CPU was walking non-nested NPT * page tables, i.e. for a direct MMU, for _guest_ page tables in L1. * If L1 is sharing (a subset of) its page tables with L2, e.g. by * having nCR3 share lower level page tables with hCR3, then when KVM * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also * unknowingly write-protecting L1's guest page tables, which KVM isn't * shadowing. * * Because the CPU (by default) walks NPT page tables using a write * access (to ensure the CPU can do A/D updates), page walks in L1 can * trigger write faults for the above case even when L1 isn't modifying * PTEs. As a result, KVM will unnecessarily emulate (or at least, try * to emulate) an excessive number of L1 instructions; because L1's MMU * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs * and thus no need to emulate in order to guarantee forward progress. * * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can * proceed without triggering emulation. If one or more shadow pages * was zapped, skip emulation and resume L1 to let it natively execute * the instruction. If no shadow pages were zapped, then the write- * fault is due to something else entirely, i.e. KVM needs to emulate, * as resuming the guest will put it into an infinite loop. * * Note, this code also applies to Intel CPUs, even though it is *very* * unlikely that an L1 will share its page tables (IA32/PAE/paging64 * format) with L2's page tables (EPT format). * * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to * unprotect the gfn and retry if an event is awaiting reinjection. If * KVM emulates multiple instructions before completing event injection, * the event could be delayed beyond what is architecturally allowed, * e.g. KVM could inject an IRQ after the TPR has been raised. */ if (((direct && is_write_to_guest_page_table(error_code)) || (!direct && kvm_event_needs_reinjection(vcpu))) && kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return RET_PF_RETRY; /* * The gfn is write-protected, but if KVM detects its emulating an * instruction that is unlikely to be used to modify page tables, or if * emulation fails, KVM can try to unprotect the gfn and let the CPU * re-execute the instruction that caused the page fault. Do not allow * retrying an instruction from a nested guest as KVM is only explicitly * shadowing L1's page tables, i.e. unprotecting something for L1 isn't * going to magically fix whatever issue caused L2 to fail. */ if (!is_guest_mode(vcpu)) *emulation_type |= EMULTYPE_ALLOW_RETRY_PF; return RET_PF_EMULATE; } int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->root_role.direct; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) return RET_PF_RETRY; /* * Except for reserved faults (emulated MMIO is shared-only), set the * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's * current attributes, which are the source of truth for such VMs. Note, * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't * currently supported nested virtualization (among many other things) * for software-protected VMs. */ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && !(error_code & PFERR_RSVD_MASK) && vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa))) error_code |= PFERR_PRIVATE_ACCESS; r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS)) return -EFAULT; r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); if (r == RET_PF_EMULATE) goto emulate; } if (r == RET_PF_INVALID) { vcpu->stat.pf_taken++; r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false, &emulation_type, NULL); if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } if (r < 0) return r; if (r == RET_PF_WRITE_PROTECTED) r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code, &emulation_type); if (r == RET_PF_FIXED) vcpu->stat.pf_fixed++; else if (r == RET_PF_EMULATE) vcpu->stat.pf_emulate++; else if (r == RET_PF_SPURIOUS) vcpu->stat.pf_spurious++; /* * None of handle_mmio_page_fault(), kvm_mmu_do_page_fault(), or * kvm_mmu_write_protect_fault() return RET_PF_CONTINUE. * kvm_mmu_do_page_fault() only uses RET_PF_CONTINUE internally to * indicate continuing the page fault handling until to the final * page table mapping phase. */ WARN_ON_ONCE(r == RET_PF_CONTINUE); if (r != RET_PF_EMULATE) return r; emulate: return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len); } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg) { u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; int root_level, leaf, level; leaf = get_sptes_lockless(vcpu, gpa, sptes, &root_level); if (unlikely(leaf < 0)) return; pr_err("%s %llx", msg, gpa); for (level = root_level; level >= leaf; level--) pr_cont(", spte[%d] = 0x%llx", level, sptes[level]); pr_cont("\n"); } EXPORT_SYMBOL_GPL(kvm_mmu_print_sptes); static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; vcpu_clear_mmio_info(vcpu, addr); /* * Walking and synchronizing SPTEs both assume they are operating in * the context of the current MMU, and would need to be reworked if * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. */ if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) return; if (!VALID_PAGE(root_hpa)) return; write_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) { struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep); if (sp->unsync) { int ret = kvm_sync_spte(vcpu, sp, iterator.index); if (ret < 0) mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL); if (ret) kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep); } if (!sp->unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); } void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots) { int i; WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL); /* It's actually a GPA for vcpu->arch.guest_mmu. */ if (mmu != &vcpu->arch.guest_mmu) { /* INVLPG on a non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_invlpg_address(addr, vcpu)) return; kvm_x86_call(flush_tlb_gva)(vcpu, addr); } if (!mmu->sync_spte) return; if (roots & KVM_MMU_ROOT_CURRENT) __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (roots & KVM_MMU_ROOT_PREVIOUS(i)) __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa); } } EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { /* * INVLPG is required to invalidate any global mappings for the VA, * irrespective of PCID. Blindly sync all roots as it would take * roughly the same amount of work/time to determine whether any of the * previous roots have a global mapping. * * Mappings not reachable via the current or previous cached roots will * be synced when switching to that new cr3, so nothing needs to be * done here for them. */ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; unsigned long roots = 0; uint i; if (pcid == kvm_get_active_pcid(vcpu)) roots |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) roots |= KVM_MMU_ROOT_PREVIOUS(i); } if (roots) kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots); ++vcpu->stat.invlpg; /* * Mappings not reachable via the current cr3 or the prev_roots will be * synced when switching to that cr3, so nothing needs to be done here * for them. */ } void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; #ifdef CONFIG_X86_64 tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled; #endif /* * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) max_huge_page_level = PG_LEVEL_1G; else max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); static void free_mmu_pages(struct kvm_mmu *mmu) { if (!tdp_enabled && mmu->pae_root) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { struct page *page; int i; mmu->root.hpa = INVALID_PAGE; mmu->root.pgd = 0; mmu->mirror_root_hpa = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) return 0; /* * When using PAE paging, the four PDPTEs are treated as 'root' pages, * while the PDP table is a per-vCPU construct that's allocated at MMU * creation. When emulating 32-bit mode, cr3 is only 32 bits even on * x86_64. Therefore we need to allocate the PDP table in the first * 4GB of memory, which happens to fit the DMA32 zone. TDP paging * generally doesn't use PAE paging and can skip allocating the PDP * table. The main exception, handled here, is SVM's 32-bit NPT. The * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). */ if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; mmu->pae_root = page_address(page); /* * CR3 is only 32 bits when PAE paging is used, thus it's impossible to * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so * that KVM's writes and the CPU's reads get along. Note, this is * only necessary when using shadow paging, as 64-bit NPT can get at * the C-bit even when shadowing 32-bit NPT, and SME isn't supported * by 32-bit kernels (when KVM itself uses 32-bit NPT). */ if (!tdp_enabled) set_memory_decrypted((unsigned long)mmu->pae_root, 1); else WARN_ON_ONCE(shadow_me_value); for (i = 0; i < 4; ++i) mmu->pae_root[i] = INVALID_PAE_ROOT; return 0; } int kvm_mmu_create(struct kvm_vcpu *vcpu) { int ret; vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu_shadow_page_cache.init_value = SHADOW_NONPRESENT_VALUE; if (!vcpu->arch.mmu_shadow_page_cache.init_value) vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); if (ret) return ret; ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); if (ret) goto fail_allocate_root; return ret; fail_allocate_root: free_mmu_pages(&vcpu->arch.guest_mmu); return ret; } #define BATCH_ZAP_PAGES 10 static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; int nr_zapped, batch = 0; LIST_HEAD(invalid_list); bool unstable; lockdep_assert_held(&kvm->slots_lock); restart: list_for_each_entry_safe_reverse(sp, node, &kvm->arch.active_mmu_pages, link) { /* * No obsolete valid page exists before a newly created page * since active_mmu_pages is a FIFO list. */ if (!is_obsolete_sp(kvm, sp)) break; /* * Invalid pages should never land back on the list of active * pages. Skip the bogus page, otherwise we'll get stuck in an * infinite loop if the page gets put back on the list (again). */ if (WARN_ON_ONCE(sp->role.invalid)) continue; /* * No need to flush the TLB since we're only zapping shadow * pages with an obsolete generation number and all vCPUS have * loaded a new root, i.e. the shadow pages being zapped cannot * be in active use by the guest. */ if (batch >= BATCH_ZAP_PAGES && cond_resched_rwlock_write(&kvm->mmu_lock)) { batch = 0; goto restart; } unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &nr_zapped); batch += nr_zapped; if (unstable) goto restart; } /* * Kick all vCPUs (via remote TLB flush) before freeing the page tables * to ensure KVM is not in the middle of a lockless shadow page table * walk, which may reference the pages. The remote TLB flush itself is * not required and is simply a convenient way to kick vCPUs as needed. * KVM performs a local TLB flush when allocating a new root (see * kvm_mmu_load()), and the reload in the caller ensure no vCPUs are * running with an obsolete MMU. */ kvm_mmu_commit_zap_page(kvm, &invalid_list); } /* * Fast invalidate all shadow pages and use lock-break technique * to zap obsolete pages. * * It's required when memslot is being deleted or VM is being * destroyed, in these cases, we should ensure that KVM MMU does * not use any resource of the being-deleted slot or all slots * after calling the function. */ static void kvm_mmu_zap_all_fast(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); write_lock(&kvm->mmu_lock); trace_kvm_mmu_zap_all_fast(kvm); /* * Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is * held for the entire duration of zapping obsolete pages, it's * impossible for there to be multiple invalid generations associated * with *valid* shadow pages at any given time, i.e. there is exactly * one valid generation and (at most) one invalid generation. */ kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; /* * In order to ensure all vCPUs drop their soon-to-be invalid roots, * invalidating TDP MMU roots must be done while holding mmu_lock for * write and in the same critical section as making the reload request, * e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield. */ if (tdp_mmu_enabled) { /* * External page tables don't support fast zapping, therefore * their mirrors must be invalidated separately by the caller. */ kvm_tdp_mmu_invalidate_roots(kvm, KVM_DIRECT_ROOTS); } /* * Notify all vcpus to reload its shadow page table and flush TLB. * Then all vcpus will switch to new shadow page table with the new * mmu_valid_gen. * * Note: we need to do this under the protection of mmu_lock, * otherwise, vcpu would purge shadow page but miss tlb flush. */ kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS); kvm_zap_obsolete_pages(kvm); write_unlock(&kvm->mmu_lock); /* * Zap the invalidated TDP MMU roots, all SPTEs must be dropped before * returning to the caller, e.g. if the zap is in response to a memslot * deletion, mmu_notifier callbacks will be unable to reach the SPTEs * associated with the deleted memslot once the update completes, and * Deferring the zap until the final reference to the root is put would * lead to use-after-free. */ if (tdp_mmu_enabled) kvm_tdp_mmu_zap_invalidated_roots(kvm, true); } void kvm_mmu_init_vm(struct kvm *kvm) { kvm->arch.shadow_mmio_value = shadow_mmio_value; INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages); spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); if (tdp_mmu_enabled) kvm_mmu_init_tdp_mmu(kvm); kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache; kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO; kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO; kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache; kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO; } static void mmu_free_vm_memory_caches(struct kvm *kvm) { kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache); kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache); kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache); } void kvm_mmu_uninit_vm(struct kvm *kvm) { if (tdp_mmu_enabled) kvm_mmu_uninit_tdp_mmu(kvm); mmu_free_vm_memory_caches(kvm); } static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { const struct kvm_memory_slot *memslot; struct kvm_memslots *slots; struct kvm_memslot_iter iter; bool flush = false; gfn_t start, end; int i; if (!kvm_memslots_have_rmaps(kvm)) return flush; for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { memslot = iter.slot; start = max(gfn_start, memslot->base_gfn); end = min(gfn_end, memslot->base_gfn + memslot->npages); if (WARN_ON_ONCE(start >= end)) continue; flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start, end, true, flush); } } return flush; } /* * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end * (not including it) */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { bool flush; if (WARN_ON_ONCE(gfn_end <= gfn_start)) return; write_lock(&kvm->mmu_lock); kvm_mmu_invalidate_begin(kvm); kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); if (tdp_mmu_enabled) flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush); if (flush) kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); kvm_mmu_invalidate_end(kvm); write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { return rmap_write_protect(rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect, start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); } } static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) { return kvm_mmu_memory_cache_nr_free_objects(cache) < min; } static bool need_topup_split_caches_or_resched(struct kvm *kvm) { if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) return true; /* * In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed * to split a single huge page. Calculating how many are actually needed * is possible but not worth the complexity. */ return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) || need_topup(&kvm->arch.split_page_header_cache, 1) || need_topup(&kvm->arch.split_shadow_page_cache, 1); } static int topup_split_caches(struct kvm *kvm) { /* * Allocating rmap list entries when splitting huge pages for nested * MMUs is uncommon as KVM needs to use a list if and only if there is * more than one rmap entry for a gfn, i.e. requires an L1 gfn to be * aliased by multiple L2 gfns and/or from multiple nested roots with * different roles. Aliasing gfns when using TDP is atypical for VMMs; * a few gfns are often aliased during boot, e.g. when remapping BIOS, * but aliasing rarely occurs post-boot or for many gfns. If there is * only one rmap entry, rmap->val points directly at that one entry and * doesn't need to allocate a list. Buffer the cache by the default * capacity so that KVM doesn't have to drop mmu_lock to topup if KVM * encounters an aliased gfn or two. */ const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS + KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE; int r; lockdep_assert_held(&kvm->slots_lock); r = __kvm_mmu_topup_memory_cache(&kvm->arch.split_desc_cache, capacity, SPLIT_DESC_CACHE_MIN_NR_OBJECTS); if (r) return r; r = kvm_mmu_topup_memory_cache(&kvm->arch.split_page_header_cache, 1); if (r) return r; return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1); } static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep) { struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); struct shadow_page_caches caches = {}; union kvm_mmu_page_role role; unsigned int access; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep)); /* * Note, huge page splitting always uses direct shadow pages, regardless * of whether the huge page itself is mapped by a direct or indirect * shadow page, since the huge page region itself is being directly * mapped with smaller pages. */ role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access); /* Direct SPs do not require a shadowed_info_cache. */ caches.page_header_cache = &kvm->arch.split_page_header_cache; caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache; /* Safe to pass NULL for vCPU since requesting a direct SP. */ return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role); } static void shadow_mmu_split_huge_page(struct kvm *kvm, const struct kvm_memory_slot *slot, u64 *huge_sptep) { struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache; u64 huge_spte = READ_ONCE(*huge_sptep); struct kvm_mmu_page *sp; bool flush = false; u64 *sptep, spte; gfn_t gfn; int index; sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep); for (index = 0; index < SPTE_ENT_PER_PAGE; index++) { sptep = &sp->spt[index]; gfn = kvm_mmu_page_get_gfn(sp, index); /* * The SP may already have populated SPTEs, e.g. if this huge * page is aliased by multiple sptes with the same access * permissions. These entries are guaranteed to map the same * gfn-to-pfn translation since the SP is direct, so no need to * modify them. * * However, if a given SPTE points to a lower level page table, * that lower level page table may only be partially populated. * Installing such SPTEs would effectively unmap a potion of the * huge page. Unmapping guest memory always requires a TLB flush * since a subsequent operation on the unmapped regions would * fail to detect the need to flush. */ if (is_shadow_present_pte(*sptep)) { flush |= !is_last_spte(*sptep, sp->role.level); continue; } spte = make_small_spte(kvm, huge_spte, sp->role, index); mmu_spte_set(sptep, spte); __rmap_add(kvm, cache, slot, sptep, gfn, sp->role.access); } __link_shadow_page(kvm, cache, huge_sptep, sp, flush); } static int shadow_mmu_try_split_huge_page(struct kvm *kvm, const struct kvm_memory_slot *slot, u64 *huge_sptep) { struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep); int level, r = 0; gfn_t gfn; u64 spte; /* Grab information for the tracepoint before dropping the MMU lock. */ gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep)); level = huge_sp->role.level; spte = *huge_sptep; if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) { r = -ENOSPC; goto out; } if (need_topup_split_caches_or_resched(kvm)) { write_unlock(&kvm->mmu_lock); cond_resched(); /* * If the topup succeeds, return -EAGAIN to indicate that the * rmap iterator should be restarted because the MMU lock was * dropped. */ r = topup_split_caches(kvm) ?: -EAGAIN; write_lock(&kvm->mmu_lock); goto out; } shadow_mmu_split_huge_page(kvm, slot, huge_sptep); out: trace_kvm_mmu_split_huge_page(gfn, spte, level, r); return r; } static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { struct rmap_iterator iter; struct kvm_mmu_page *sp; u64 *huge_sptep; int r; restart: for_each_rmap_spte(rmap_head, &iter, huge_sptep) { sp = sptep_to_sp(huge_sptep); /* TDP MMU is enabled, so rmap only contains nested MMU SPs. */ if (WARN_ON_ONCE(!sp->role.guest_mode)) continue; /* The rmaps should never contain non-leaf SPTEs. */ if (WARN_ON_ONCE(!is_large_pte(*huge_sptep))) continue; /* SPs with level >PG_LEVEL_4K should never by unsync. */ if (WARN_ON_ONCE(sp->unsync)) continue; /* Don't bother splitting huge pages on invalid SPs. */ if (sp->role.invalid) continue; r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep); /* * The split succeeded or needs to be retried because the MMU * lock was dropped. Either way, restart the iterator to get it * back into a consistent state. */ if (!r || r == -EAGAIN) goto restart; /* The split failed and shouldn't be retried (e.g. -ENOMEM). */ break; } return false; } static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t start, gfn_t end, int target_level) { int level; /* * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working * down to the target level. This ensures pages are recursively split * all the way to the target level. There's no need to split pages * already at the target level. */ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages, level, level, start, end - 1, true, true, false); } /* Must be called with the mmu_lock held in write-mode. */ void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level) { if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, false); /* * A TLB flush is unnecessary at this point for the same reasons as in * kvm_mmu_slot_try_split_huge_pages(). */ } void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, int target_level) { u64 start = memslot->base_gfn; u64 end = start + memslot->npages; if (!tdp_mmu_enabled) return; if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); kvm_shadow_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level); write_unlock(&kvm->mmu_lock); } read_lock(&kvm->mmu_lock); kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true); read_unlock(&kvm->mmu_lock); /* * No TLB flush is necessary here. KVM will flush TLBs after * write-protecting and/or clearing dirty on the newly split SPTEs to * ensure that guest writes are reflected in the dirty log before the * ioctl to enable dirty logging on this memslot completes. Since the * split SPTEs retain the write and dirty bits of the huge SPTE, it is * safe for KVM to decide if a TLB flush is necessary based on the split * SPTEs. */ } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; int need_tlb_flush = 0; struct kvm_mmu_page *sp; restart: for_each_rmap_spte(rmap_head, &iter, sptep) { sp = sptep_to_sp(sptep); /* * We cannot do huge page mapping for indirect shadow pages, * which are found on the last rmap (level = 1) when not using * tdp; such shadow pages are synced with the page table in * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ if (sp->role.direct && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn)) { kvm_zap_one_rmap_spte(kvm, rmap_head, sptep); if (kvm_available_flush_remote_tlbs_range()) kvm_flush_remote_tlbs_sptep(kvm, sptep); else need_tlb_flush = 1; goto restart; } } return need_tlb_flush; } EXPORT_SYMBOL_GPL(kvm_zap_gfn_range); static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { /* * Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap * pages that are already mapped at the maximum hugepage level. */ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true)) kvm_flush_remote_tlbs_memslot(kvm, slot); } void kvm_mmu_recover_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *slot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); kvm_rmap_zap_collapsible_sptes(kvm, slot); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_recover_huge_pages(kvm, slot); read_unlock(&kvm->mmu_lock); } } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } if (tdp_mmu_enabled) { read_lock(&kvm->mmu_lock); kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); } /* * The caller will flush the TLBs after this function returns. * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap. */ } static void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); int ign; write_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { if (WARN_ON_ONCE(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; if (cond_resched_rwlock_write(&kvm->mmu_lock)) goto restart; } kvm_mmu_commit_zap_page(kvm, &invalid_list); if (tdp_mmu_enabled) kvm_tdp_mmu_zap_all(kvm); write_unlock(&kvm->mmu_lock); } void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); } static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm, struct kvm_memory_slot *slot, bool flush) { LIST_HEAD(invalid_list); unsigned long i; if (list_empty(&kvm->arch.active_mmu_pages)) goto out_flush; /* * Since accounting information is stored in struct kvm_arch_memory_slot, * all MMU pages that are shadowing guest PTEs must be zapped before the * memslot is deleted, as freeing such pages after the memslot is freed * will result in use-after-free, e.g. in unaccount_shadowed(). */ for (i = 0; i < slot->npages; i++) { struct kvm_mmu_page *sp; gfn_t gfn = slot->base_gfn + i; for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); flush = false; cond_resched_rwlock_write(&kvm->mmu_lock); } } out_flush: kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); } static void kvm_mmu_zap_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_gfn_range range = { .slot = slot, .start = slot->base_gfn, .end = slot->base_gfn + slot->npages, .may_block = true, }; bool flush; write_lock(&kvm->mmu_lock); flush = kvm_unmap_gfn_range(kvm, &range); kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush); write_unlock(&kvm->mmu_lock); } static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm) { return kvm->arch.vm_type == KVM_X86_DEFAULT_VM && kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { if (kvm_memslot_flush_zap_all(kvm)) kvm_mmu_zap_all_fast(kvm); else kvm_mmu_zap_memslot(kvm, slot); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) { WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); gen &= MMIO_SPTE_GEN_MASK; /* * Generation numbers are incremented in multiples of the number of * address spaces in order to provide unique generations across all * address spaces. Strip what is effectively the address space * modifier prior to checking for a wrap of the MMIO generation so * that a wrap in any address space is detected. */ gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1); /* * The very rare case: if the MMIO generation number has wrapped, * zap all shadow pages. */ if (unlikely(gen == 0)) { kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_zap_all_fast(kvm); } } static void mmu_destroy_caches(void) { kmem_cache_destroy(pte_list_desc_cache); kmem_cache_destroy(mmu_page_header_cache); } static void kvm_wake_nx_recovery_thread(struct kvm *kvm) { /* * The NX recovery thread is spawned on-demand at the first KVM_RUN and * may not be valid even though the VM is globally visible. Do nothing, * as such a VM can't have any possible NX huge pages. */ struct vhost_task *nx_thread = READ_ONCE(kvm->arch.nx_huge_page_recovery_thread); if (nx_thread) vhost_task_wake(nx_thread); } static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) { if (nx_hugepage_mitigation_hard_disabled) return sysfs_emit(buffer, "never\n"); return param_get_bool(buffer, kp); } static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); } static void __set_nx_huge_pages(bool val) { nx_huge_pages = itlb_multihit_kvm_mitigation = val; } static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) { bool old_val = nx_huge_pages; bool new_val; if (nx_hugepage_mitigation_hard_disabled) return -EPERM; /* In "auto" mode deploy workaround only if CPU has the bug. */ if (sysfs_streq(val, "off")) { new_val = 0; } else if (sysfs_streq(val, "force")) { new_val = 1; } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); } else if (sysfs_streq(val, "never")) { new_val = 0; mutex_lock(&kvm_lock); if (!list_empty(&vm_list)) { mutex_unlock(&kvm_lock); return -EBUSY; } nx_hugepage_mitigation_hard_disabled = true; mutex_unlock(&kvm_lock); } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; } __set_nx_huge_pages(new_val); if (new_val != old_val) { struct kvm *kvm; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { mutex_lock(&kvm->slots_lock); kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); kvm_wake_nx_recovery_thread(kvm); } mutex_unlock(&kvm_lock); } return 0; } /* * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as * its default value of -1 is technically undefined behavior for a boolean. * Forward the module init call to SPTE code so that it too can handle module * params that need to be resolved/snapshot. */ void __init kvm_mmu_x86_module_init(void) { if (nx_huge_pages == -1) __set_nx_huge_pages(get_nx_auto_mode()); /* * Snapshot userspace's desire to enable the TDP MMU. Whether or not the * TDP MMU is actually enabled is determined in kvm_configure_mmu() * when the vendor module is loaded. */ tdp_mmu_allowed = tdp_mmu_enabled; kvm_mmu_spte_module_init(); } /* * The bulk of the MMU initialization is deferred until the vendor module is * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need * to be reset when a potentially different vendor module is loaded. */ int kvm_mmu_vendor_module_init(void) { int ret = -ENOMEM; /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave * and the current status quo is unlikely to change. Guardians below are * supposed to let us know if the assumption becomes false. */ BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64)); kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT); if (!pte_list_desc_cache) goto out; mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", sizeof(struct kvm_mmu_page), 0, SLAB_ACCOUNT, NULL); if (!mmu_page_header_cache) goto out; return 0; out: mmu_destroy_caches(); return ret; } void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); if (tdp_mmu_enabled) { read_lock(&vcpu->kvm->mmu_lock); mmu_free_root_page(vcpu->kvm, &vcpu->arch.mmu->mirror_root_hpa, NULL); read_unlock(&vcpu->kvm->mmu_lock); } free_mmu_pages(&vcpu->arch.root_mmu); free_mmu_pages(&vcpu->arch.guest_mmu); mmu_free_memory_caches(vcpu); } void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); } /* * Calculate the effective recovery period, accounting for '0' meaning "let KVM * select a halving time of 1 hour". Returns true if recovery is enabled. */ static bool calc_nx_huge_pages_recovery_period(uint *period) { /* * Use READ_ONCE to get the params, this may be called outside of the * param setters, e.g. by the kthread to compute its next timeout. */ bool enabled = READ_ONCE(nx_huge_pages); uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); if (!enabled || !ratio) return false; *period = READ_ONCE(nx_huge_pages_recovery_period_ms); if (!*period) { /* Make sure the period is not less than one second. */ ratio = min(ratio, 3600u); *period = 60 * 60 * 1000 / ratio; } return true; } static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { bool was_recovery_enabled, is_recovery_enabled; uint old_period, new_period; int err; if (nx_hugepage_mitigation_hard_disabled) return -EPERM; was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); if (err) return err; is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); if (is_recovery_enabled && (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_wake_nx_recovery_thread(kvm); mutex_unlock(&kvm_lock); } return err; } static void kvm_recover_nx_huge_pages(struct kvm *kvm) { unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; struct kvm_memory_slot *slot; int rcu_idx; struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); bool flush = false; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); write_lock(&kvm->mmu_lock); /* * Zapping TDP MMU shadow pages, including the remote TLB flush, must * be done under RCU protection, because the pages are freed via RCU * callback. */ rcu_read_lock(); ratio = READ_ONCE(nx_huge_pages_recovery_ratio); to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; for ( ; to_zap; --to_zap) { if (list_empty(&kvm->arch.possible_nx_huge_pages)) break; /* * We use a separate list instead of just using active_mmu_pages * because the number of shadow pages that be replaced with an * NX huge page is expected to be relatively small compared to * the total number of shadow pages. And because the TDP MMU * doesn't use active_mmu_pages. */ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages, struct kvm_mmu_page, possible_nx_huge_page_link); WARN_ON_ONCE(!sp->nx_huge_page_disallowed); WARN_ON_ONCE(!sp->role.direct); /* * Unaccount and do not attempt to recover any NX Huge Pages * that are being dirty tracked, as they would just be faulted * back in as 4KiB pages. The NX Huge Pages in this slot will be * recovered, along with all the other huge pages in the slot, * when dirty logging is disabled. * * Since gfn_to_memslot() is relatively expensive, it helps to * skip it if it the test cannot possibly return true. On the * other hand, if any memslot has logging enabled, chances are * good that all of them do, in which case unaccount_nx_huge_page() * is much cheaper than zapping the page. * * If a memslot update is in progress, reading an incorrect value * of kvm->nr_memslots_dirty_logging is not a problem: if it is * becoming zero, gfn_to_memslot() will be done unnecessarily; if * it is becoming nonzero, the page will be zapped unnecessarily. * Either way, this only affects efficiency in racy situations, * and not correctness. */ slot = NULL; if (atomic_read(&kvm->nr_memslots_dirty_logging)) { struct kvm_memslots *slots; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); WARN_ON_ONCE(!slot); } if (slot && kvm_slot_dirty_track_enabled(slot)) unaccount_nx_huge_page(kvm, sp); else if (is_tdp_mmu_page(sp)) flush |= kvm_tdp_mmu_zap_sp(kvm, sp); else kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->nx_huge_page_disallowed); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); cond_resched_rwlock_write(&kvm->mmu_lock); flush = false; rcu_read_lock(); } } kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); rcu_read_unlock(); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); } static void kvm_nx_huge_page_recovery_worker_kill(void *data) { } static bool kvm_nx_huge_page_recovery_worker(void *data) { struct kvm *kvm = data; bool enabled; uint period; long remaining_time; enabled = calc_nx_huge_pages_recovery_period(&period); if (!enabled) return false; remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period) - get_jiffies_64(); if (remaining_time > 0) { schedule_timeout(remaining_time); /* check for signals and come back */ return true; } __set_current_state(TASK_RUNNING); kvm_recover_nx_huge_pages(kvm); kvm->arch.nx_huge_page_last = get_jiffies_64(); return true; } static void kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); struct vhost_task *nx_thread; kvm->arch.nx_huge_page_last = get_jiffies_64(); nx_thread = vhost_task_create(kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); if (!nx_thread) return; vhost_task_start(nx_thread); /* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); } int kvm_mmu_post_init_vm(struct kvm *kvm) { if (nx_hugepage_mitigation_hard_disabled) return 0; call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); if (!kvm->arch.nx_huge_page_recovery_thread) return -ENOMEM; return 0; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread); } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM * can simply ignore such slots. But if userspace is making memory * PRIVATE, then KVM must prevent the guest from accessing the memory * as shared. And if userspace is making memory SHARED and this point * is reached, then at least one page within the range was previously * PRIVATE, i.e. the slot's possible hugepage ranges are changing. * Zapping SPTEs in this case ensures KVM will reassess whether or not * a hugepage can be used for affected ranges. */ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; /* Unmap the old attribute page. */ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) range->attr_filter = KVM_FILTER_SHARED; else range->attr_filter = KVM_FILTER_PRIVATE; return kvm_unmap_gfn_range(kvm, range); } static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; } static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; } static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; } static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) { const unsigned long start = gfn; const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); if (level == PG_LEVEL_2M) return kvm_range_has_memory_attributes(kvm, start, end, ~0, attrs); for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { if (hugepage_test_mixed(slot, gfn, level - 1) || attrs != kvm_get_memory_attributes(kvm, gfn)) return false; } return true; } bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { unsigned long attrs = range->arg.attributes; struct kvm_memory_slot *slot = range->slot; int level; lockdep_assert_held_write(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); /* * Calculate which ranges can be mapped with hugepages even if the slot * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over * a range that has PRIVATE GFNs, and conversely converting a range to * SHARED may now allow hugepages. */ if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; /* * The sequence matters here: upper levels consume the result of lower * level's scanning. */ for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); gfn_t gfn = gfn_round_for_level(range->start, level); /* Process the head page if it straddles the range. */ if (gfn != range->start || gfn + nr_pages > range->end) { /* * Skip mixed tracking if the aligned gfn isn't covered * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ if (gfn >= slot->base_gfn && gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } gfn += nr_pages; } /* * Pages entirely covered by the range are guaranteed to have * only the attributes which were just set. */ for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) hugepage_clear_mixed(slot, gfn, level); /* * Process the last tail page if it straddles the range and is * contained by the memslot. Like the head page, KVM can't * create a hugepage if the slot size is misaligned. */ if (gfn < range->end && (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } } return false; } void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, struct kvm_memory_slot *slot) { int level; if (!kvm_arch_has_private_mem(kvm)) return; for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { /* * Don't bother tracking mixed attributes for pages that can't * be huge due to alignment, i.e. process only pages that are * entirely contained by the memslot. */ gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); gfn_t start = gfn_round_for_level(slot->base_gfn, level); gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); gfn_t gfn; if (start < slot->base_gfn) start += nr_pages; /* * Unlike setting attributes, every potential hugepage needs to * be manually checked as the attributes may already be mixed. */ for (gfn = start; gfn < end; gfn += nr_pages) { unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else hugepage_set_mixed(slot, gfn, level); } } } #endif
5 6 76 31 39 75 10 16 35 9 184 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _NET_ETHTOOL_NETLINK_H #define _NET_ETHTOOL_NETLINK_H #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/sock.h> struct ethnl_req_info; int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *nest, struct net *net, struct netlink_ext_ack *extack, bool require_dev); int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype); struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); /** * ethnl_strz_size() - calculate attribute length for fixed size string * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Return: total length of an attribute with null terminated string from @s */ static inline int ethnl_strz_size(const char *s) { return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); } /** * ethnl_put_strz() - put string attribute with fixed size string * @skb: skb with the message * @attrtype: attribute type * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Puts an attribute with null terminated string from @s into the message. * * Return: 0 on success, negative error code on failure */ static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, const char *s) { unsigned int len = strnlen(s, ETH_GSTRING_LEN); struct nlattr *attr; attr = nla_reserve(skb, attrtype, len + 1); if (!attr) return -EMSGSIZE; memcpy(nla_data(attr), s, len); ((char *)nla_data(attr))[len] = '\0'; return 0; } /** * ethnl_update_u32() - update u32 value from NLA_U32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u32 value from NLA_U32 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, bool *mod) { u32 val; if (!attr) return; val = nla_get_u32(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_u8() - update u8 value from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u8 value from NLA_U8 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = nla_get_u8(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool() - updateb bool used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the bool value from NLA_U8 netlink attribute @attr to set bool variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_binary() - update binary data from NLA_BINARY attribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block * of length @len at @dst by attribute payload; do nothing if @attr is null. * Bool pointed to by @mod is set to true if this function changed the logical * value of *dst, otherwise it is left as is. */ static inline void ethnl_update_binary(void *dst, unsigned int len, const struct nlattr *attr, bool *mod) { if (!attr) return; if (nla_len(attr) < len) len = nla_len(attr); if (!memcmp(dst, nla_data(attr), len)) return; memcpy(dst, nla_data(attr), len); *mod = true; } /** * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Update bits in u32 value which are set in attribute's mask to values from * attribute's value. Do nothing if @attr is null or the value wouldn't change; * otherwise, set bool pointed to by @mod to true. */ static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, bool *mod) { struct nla_bitfield32 change; u32 newval; if (!attr) return; change = nla_get_bitfield32(attr); newval = (*dst & ~change.selector) | (change.value & change.selector); if (*dst == newval) return; *dst = newval; *mod = true; } /** * ethnl_reply_header_size() - total size of reply header * * This is an upper estimate so that we do not need to hold RTNL lock longer * than necessary (to prevent rename between size estimate and composing the * message). Accounts only for device ifindex and name as those are the only * attributes ethnl_fill_reply_header() puts into the reply header. */ static inline unsigned int ethnl_reply_header_size(void) { return nla_total_size(nla_total_size(sizeof(u32)) + nla_total_size(IFNAMSIZ)); } /* GET request handling */ /* Unified processing of GET requests uses two data structures: request info * and reply data. Request info holds information parsed from client request * and its stays constant through all request processing. Reply data holds data * retrieved from ethtool_ops callbacks or other internal sources which is used * to compose the reply. When processing a dump request, request info is filled * only once (when the request message is parsed) but reply data is filled for * each reply message. * * Both structures consist of part common for all request types (struct * ethnl_req_info and struct ethnl_reply_data defined below) and optional * parts specific for each request type. Common part always starts at offset 0. */ /** * struct ethnl_req_info - base type of request information for GET requests * @dev: network device the request is for (may be null) * @dev_tracker: refcount tracker for @dev reference * @flags: request flags common for all request types * @phy_index: phy_device index connected to @dev this request is for. Can be * 0 if the request doesn't target a phy, or if the @dev's attached * phy is targeted. * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at * zero offset. */ struct ethnl_req_info { struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; u32 phy_index; }; static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) { netdev_put(req_info->dev, &req_info->dev_tracker); } /** * ethnl_req_get_phydev() - Gets the phy_device targeted by this request, * if any. Must be called under rntl_lock(). * @req_info: The ethnl request to get the phy from. * @header: The netlink header, used for error reporting. * @extack: The netlink extended ACK, for error reporting. * * The caller must hold RTNL, until it's done interacting with the returned * phy_device. * * Return: A phy_device pointer corresponding either to the passed phy_index * if one is provided. If not, the phy_device attached to the * net_device targeted by this request is returned. If there's no * targeted net_device, or no phy_device is attached, NULL is * returned. If the provided phy_index is invalid, an error pointer * is returned. */ struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, const struct nlattr *header, struct netlink_ext_ack *extack); /** * struct ethnl_reply_data - base type of reply data for GET requests * @dev: device for current reply message; in single shot requests it is * equal to &ethnl_req_info.dev; in dumps it's different for each * reply message * * This is a common base for request specific structures holding data for * kernel reply message. These always embed struct ethnl_reply_data at zero * offset. */ struct ethnl_reply_data { struct net_device *dev; }; int ethnl_ops_begin(struct net_device *dev); void ethnl_ops_complete(struct net_device *dev); enum ethnl_sock_type { ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH, }; struct ethnl_sock_priv { struct net_device *dev; u32 portid; enum ethnl_sock_type type; }; int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type); /** * struct ethnl_request_ops - unified handling of GET and SET requests * @request_cmd: command id for request (GET) * @reply_cmd: command id for reply (GET_REPLY) * @hdr_attr: attribute type for request header * @req_info_size: size of request info * @reply_data_size: size of reply data * @allow_nodev_do: allow non-dump request with no device identification * @set_ntf_cmd: notification to generate on changes (SET) * @parse_request: * Parse request except common header (struct ethnl_req_info). Common * header is already filled on entry, the rest up to @repdata_offset * is zero initialized. This callback should only modify type specific * request info by parsed attributes from request message. * @prepare_data: * Retrieve and prepare data needed to compose a reply message. Calls to * ethtool_ops handlers are limited to this callback. Common reply data * (struct ethnl_reply_data) is filled on entry, type specific part after * it is zero initialized. This callback should only modify the type * specific part of reply data. Device identification from struct * ethnl_reply_data is to be used as for dump requests, it iterates * through network devices while dev member of struct ethnl_req_info * points to the device from client request. * @reply_size: * Estimate reply message size. Returned value must be sufficient for * message payload without common reply header. The callback may returned * estimate higher than actual message size if exact calculation would * not be worth the saved memory space. * @fill_reply: * Fill reply message payload (except for common header) from reply data. * The callback must not generate more payload than previously called * ->reply_size() estimated. * @cleanup_data: * Optional cleanup called when reply data is no longer needed. Can be * used e.g. to free any additional data structures outside the main * structure which were allocated by ->prepare_data(). When processing * dump requests, ->cleanup() is called for each message. * @set_validate: * Check if set operation is supported for a given device, and perform * extra input checks. Expected return values: * - 0 if the operation is a noop for the device (rare) * - 1 if operation should proceed to calling @set * - negative errno on errors * Called without any locks, just a reference on the netdev. * @set: * Execute the set operation. The implementation should return * - 0 if no configuration has changed * - 1 if configuration changed and notification should be generated * - negative errno on errors * * Description of variable parts of GET request handling when using the * unified infrastructure. When used, a pointer to an instance of this * structure is to be added to &ethnl_default_requests array and generic * handlers ethnl_default_doit(), ethnl_default_dumpit(), * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; * ethnl_default_notify() can be used in @ethnl_notify_handlers to send * notifications of the corresponding type. */ struct ethnl_request_ops { u8 request_cmd; u8 reply_cmd; u16 hdr_attr; unsigned int req_info_size; unsigned int reply_data_size; bool allow_nodev_do; u8 set_ntf_cmd; int (*parse_request)(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack); int (*prepare_data)(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, const struct genl_info *info); int (*reply_size)(const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); int (*fill_reply)(struct sk_buff *skb, const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); void (*cleanup_data)(struct ethnl_reply_data *reply_data); int (*set_validate)(struct ethnl_req_info *req_info, struct genl_info *info); int (*set)(struct ethnl_req_info *req_info, struct genl_info *info); }; /* request handlers */ extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; extern const struct ethnl_request_ops ethnl_rings_request_ops; extern const struct ethnl_request_ops ethnl_channels_request_ops; extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1]; extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1]; extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1]; extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1]; extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1]; extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1]; extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1]; extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1]; extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1]; extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1]; extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1]; extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info); int ethnl_rss_dump_start(struct netlink_callback *cb); int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_phy_start(struct netlink_callback *cb); int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_phy_done(struct netlink_callback *cb); int ethnl_tsinfo_start(struct netlink_callback *cb); int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_done(struct netlink_callback *cb); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */
20 21 8 29 8 70 7 3 5 1 6 2 5 1 5 70 75 40 74 7 70 13 68 32 27 4 23 21 21 9 8 6 21 22 1 21 21 22 19 19 19 10 9 18 19 9 2 7 7 7 9 10 7 2 6 3 1 1 5 4 5 3 8 8 8 4 4 8 8 8 8 1 7 71 38 14 33 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * minix directory handling functions * * Updated to filesystem version 3 by Daniel Aragones */ #include "minix.h" #include <linux/buffer_head.h> #include <linux/highmem.h> #include <linux/swap.h> typedef struct minix_dir_entry minix_dirent; typedef struct minix3_dir_entry minix3_dirent; static int minix_readdir(struct file *, struct dir_context *); const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = minix_readdir, .fsync = generic_file_fsync, }; /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. */ static unsigned minix_last_byte(struct inode *inode, unsigned long page_nr) { unsigned last_byte = PAGE_SIZE; if (page_nr == (inode->i_size >> PAGE_SHIFT)) last_byte = inode->i_size & (PAGE_SIZE - 1); return last_byte; } static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) { struct address_space *mapping = folio->mapping; struct inode *dir = mapping->host; block_write_end(NULL, mapping, pos, len, len, folio, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } folio_unlock(folio); } static int minix_handle_dirsync(struct inode *dir) { int err; err = filemap_write_and_wait(dir->i_mapping); if (!err) err = sync_inode_metadata(dir, 1); return err; } static void *dir_get_folio(struct inode *dir, unsigned long n, struct folio **foliop) { struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); if (IS_ERR(folio)) return ERR_CAST(folio); *foliop = folio; return kmap_local_folio(folio, 0); } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) { return (void*)((char*)de + sbi->s_dirsize); } static int minix_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct minix_sb_info *sbi = minix_sb(sb); unsigned chunk_size = sbi->s_dirsize; unsigned long npages = dir_pages(inode); unsigned long pos = ctx->pos; unsigned offset; unsigned long n; ctx->pos = pos = ALIGN(pos, chunk_size); if (pos >= inode->i_size) return 0; offset = pos & ~PAGE_MASK; n = pos >> PAGE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; struct folio *folio; kaddr = dir_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) continue; p = kaddr+offset; limit = kaddr + minix_last_byte(inode, n) - chunk_size; for ( ; p <= limit; p = minix_next_entry(p, sbi)) { const char *name; __u32 inumber; if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; name = de3->name; inumber = de3->inode; } else { minix_dirent *de = (minix_dirent *)p; name = de->name; inumber = de->inode; } if (inumber) { unsigned l = strnlen(name, sbi->s_namelen); if (!dir_emit(ctx, name, l, inumber, DT_UNKNOWN)) { folio_release_kmap(folio, p); return 0; } } ctx->pos += chunk_size; } folio_release_kmap(folio, kaddr); } return 0; } static inline int namecompare(int len, int maxlen, const char * name, const char * buffer) { if (len < maxlen && buffer[len]) return 0; return !memcmp(name, buffer, len); } /* * minix_find_entry() * * finds an entry in the specified directory with the wanted name. * It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * * On Success folio_release_kmap() should be called on *foliop. */ minix_dirent *minix_find_entry(struct dentry *dentry, struct folio **foliop) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct inode * dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; unsigned long npages = dir_pages(dir); char *p; char *namx; __u32 inumber; for (n = 0; n < npages; n++) { char *kaddr, *limit; kaddr = dir_get_folio(dir, n, foliop); if (IS_ERR(kaddr)) continue; limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; namx = de3->name; inumber = de3->inode; } else { minix_dirent *de = (minix_dirent *)p; namx = de->name; inumber = de->inode; } if (!inumber) continue; if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } folio_release_kmap(*foliop, kaddr); } return NULL; found: return (minix_dirent *)p; } int minix_add_link(struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); struct folio *folio = NULL; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr, *p; minix_dirent *de; minix3_dirent *de3; loff_t pos; int err; char *namx = NULL; __u32 inumber; /* * We take care of directory expansion in the same loop * This code plays outside i_size, so it locks the page * to protect that region. */ for (n = 0; n <= npages; n++) { char *limit, *dir_end; kaddr = dir_get_folio(dir, n, &folio); if (IS_ERR(kaddr)) return PTR_ERR(kaddr); folio_lock(folio); dir_end = kaddr + minix_last_byte(dir, n); limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { de = (minix_dirent *)p; de3 = (minix3_dirent *)p; if (sbi->s_version == MINIX_V3) { namx = de3->name; inumber = de3->inode; } else { namx = de->name; inumber = de->inode; } if (p == dir_end) { /* We hit i_size */ if (sbi->s_version == MINIX_V3) de3->inode = 0; else de->inode = 0; goto got_it; } if (!inumber) goto got_it; err = -EEXIST; if (namecompare(namelen, sbi->s_namelen, name, namx)) goto out_unlock; } folio_unlock(folio); folio_release_kmap(folio, kaddr); } BUG(); return -EINVAL; got_it: pos = folio_pos(folio) + offset_in_folio(folio, p); err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) goto out_unlock; memcpy (namx, name, namelen); if (sbi->s_version == MINIX_V3) { memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4); de3->inode = inode->i_ino; } else { memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); de->inode = inode->i_ino; } dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: folio_release_kmap(folio, kaddr); return err; out_unlock: folio_unlock(folio); goto out_put; } int minix_delete_entry(struct minix_dir_entry *de, struct folio *folio) { struct inode *inode = folio->mapping->host; loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); struct minix_sb_info *sbi = minix_sb(inode->i_sb); unsigned len = sbi->s_dirsize; int err; folio_lock(folio); err = minix_prepare_chunk(folio, pos, len); if (err) { folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = 0; else de->inode = 0; dir_commit_chunk(folio, pos, len); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); } int minix_make_empty(struct inode *inode, struct inode *dir) { struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *kaddr; int err; if (IS_ERR(folio)) return PTR_ERR(folio); err = minix_prepare_chunk(folio, 0, 2 * sbi->s_dirsize); if (err) { folio_unlock(folio); goto fail; } kaddr = kmap_local_folio(folio, 0); memset(kaddr, 0, folio_size(folio)); if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)kaddr; de3->inode = inode->i_ino; strcpy(de3->name, "."); de3 = minix_next_entry(de3, sbi); de3->inode = dir->i_ino; strcpy(de3->name, ".."); } else { minix_dirent *de = (minix_dirent *)kaddr; de->inode = inode->i_ino; strcpy(de->name, "."); de = minix_next_entry(de, sbi); de->inode = dir->i_ino; strcpy(de->name, ".."); } kunmap_local(kaddr); dir_commit_chunk(folio, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); fail: folio_put(folio); return err; } /* * routine to check that the specified directory is empty (for rmdir) */ int minix_empty_dir(struct inode * inode) { struct folio *folio = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *name, *kaddr; __u32 inumber; for (i = 0; i < npages; i++) { char *p, *limit; kaddr = dir_get_folio(inode, i, &folio); if (IS_ERR(kaddr)) continue; limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; name = de3->name; inumber = de3->inode; } else { minix_dirent *de = (minix_dirent *)p; name = de->name; inumber = de->inode; } if (inumber != 0) { /* check for . and .. */ if (name[0] != '.') goto not_empty; if (!name[1]) { if (inumber != inode->i_ino) goto not_empty; } else if (name[1] != '.') goto not_empty; else if (name[2]) goto not_empty; } } folio_release_kmap(folio, kaddr); } return 1; not_empty: folio_release_kmap(folio, kaddr); return 0; } /* Releases the page */ int minix_set_link(struct minix_dir_entry *de, struct folio *folio, struct inode *inode) { struct inode *dir = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); int err; folio_lock(folio); err = minix_prepare_chunk(folio, pos, sbi->s_dirsize); if (err) { folio_unlock(folio); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = inode->i_ino; else de->inode = inode->i_ino; dir_commit_chunk(folio, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } struct minix_dir_entry *minix_dotdot(struct inode *dir, struct folio **foliop) { struct minix_sb_info *sbi = minix_sb(dir->i_sb); struct minix_dir_entry *de = dir_get_folio(dir, 0, foliop); if (!IS_ERR(de)) return minix_next_entry(de, sbi); return NULL; } ino_t minix_inode_by_name(struct dentry *dentry) { struct folio *folio; struct minix_dir_entry *de = minix_find_entry(dentry, &folio); ino_t res = 0; if (de) { struct inode *inode = folio->mapping->host; struct minix_sb_info *sbi = minix_sb(inode->i_sb); if (sbi->s_version == MINIX_V3) res = ((minix3_dirent *) de)->inode; else res = de->inode; folio_release_kmap(folio, de); } return res; }
148 99 111 80 2 136 127 10 16 122 54 15 7 34 39 3 54 22 34 19 3 17 3 3 18 15 3 10 10 10 8 7 1 13 7 6 30 19 16 3 2 24 19 2 3 5 8 1 5 3 2 1 1 18 18 14 2 2 4 4 65 1 15 11 25 50 35 49 50 42 20 15 29 6 25 41 4 47 16 41 1 32 13 12 12 27 6 31 13 11 8 10 8 7 30 5 1 1 10 2 1 9 9 6 9 2 8 10 50 50 49 8 50 8 1 41 39 40 49 1 1 26 13 41 3 18 17 5 17 14 26 28 44 13 10 34 43 17 26 43 43 8 34 49 50 49 43 35 20 4 41 1 4 43 14 21 26 21 14 5 14 21 16 19 16 5 4 2 1 19 43 39 2 35 2 18 20 31 5 4 1 12 26 35 37 5 42 25 2 1 16 16 4 2 58 58 58 9 3 1 2 10 10 16 16 1 24 24 11 33 1 1 1 3 73 70 3 3 73 10 61 9 61 31 47 3 45 23 23 67 22 50 25 47 1 3 67 60 43 17 20 19 9 61 4 58 17 41 13 45 51 9 50 57 1 2 28 30 21 21 21 21 56 1 15 32 5 1 2 2 2 58 27 29 2 82 7 2 78 18 3 15 4 15 15 13 2 1 14 15 73 33 69 16 16 69 9 7 12 12 59 5 117 114 21 20 114 54 113 171 94 165 26 19 7 170 146 126 79 159 21 64 93 68 134 33 8 37 1 36 37 27 24 7 13 14 4 22 4 4 27 27 147 147 81 2 22 32 137 136 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 // SPDX-License-Identifier: GPL-2.0-only /* * inode.c * * PURPOSE * Inode handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 10/04/98 dgb Added rudimentary directory functions * 10/07/98 Fully working udf_block_map! It works! * 11/25/98 bmap altered to better support extents * 12/06/98 blf partition support in udf_iget, udf_block_map * and udf_read_inode * 12/12/98 rewrote udf_block_map to handle next extents and descs across * block boundaries (which is not actually allowed) * 12/20/98 added support for strategy 4096 * 03/07/99 rewrote udf_block_map (again) * New funcs, inode_bmap, udf_next_aext * 04/19/99 Support for writing device EA's for major/minor # */ #include "udfdecl.h" #include <linux/mm.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" #define EXTENT_MERGE_SIZE 5 #define FE_MAPPED_PERMS (FE_PERM_U_READ | FE_PERM_U_WRITE | FE_PERM_U_EXEC | \ FE_PERM_G_READ | FE_PERM_G_WRITE | FE_PERM_G_EXEC | \ FE_PERM_O_READ | FE_PERM_O_WRITE | FE_PERM_O_EXEC) #define FE_DELETE_PERMS (FE_PERM_U_DELETE | FE_PERM_G_DELETE | \ FE_PERM_O_DELETE) struct udf_map_rq; static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static int inode_getblk(struct inode *inode, struct udf_map_rq *map); static int udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, udf_pblk_t, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); static int udf_update_extents(struct inode *, struct kernel_long_ad *, int, int, struct extent_position *); static int udf_get_block_wb(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); static void __udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->cached_extent.lstart != -1) { brelse(iinfo->cached_extent.epos.bh); iinfo->cached_extent.lstart = -1; } } /* Invalidate extent cache */ static void udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); __udf_clear_extent_cache(inode); spin_unlock(&iinfo->i_extent_cache_lock); } /* Return contents of extent cache */ static int udf_read_extent_cache(struct inode *inode, loff_t bcount, loff_t *lbcount, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); int ret = 0; spin_lock(&iinfo->i_extent_cache_lock); if ((iinfo->cached_extent.lstart <= bcount) && (iinfo->cached_extent.lstart != -1)) { /* Cache hit */ *lbcount = iinfo->cached_extent.lstart; memcpy(pos, &iinfo->cached_extent.epos, sizeof(struct extent_position)); if (pos->bh) get_bh(pos->bh); ret = 1; } spin_unlock(&iinfo->i_extent_cache_lock); return ret; } /* Add extent to extent cache */ static void udf_update_extent_cache(struct inode *inode, loff_t estart, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); /* Invalidate previously cached extent */ __udf_clear_extent_cache(inode); if (pos->bh) get_bh(pos->bh); memcpy(&iinfo->cached_extent.epos, pos, sizeof(*pos)); iinfo->cached_extent.lstart = estart; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: iinfo->cached_extent.epos.offset -= sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: iinfo->cached_extent.epos.offset -= sizeof(struct long_ad); break; } spin_unlock(&iinfo->i_extent_cache_lock); } void udf_evict_inode(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; if (!is_bad_inode(inode)) { if (!inode->i_nlink) { want_delete = 1; udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); } if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } } truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); kfree(iinfo->i_data); iinfo->i_data = NULL; udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); } } static void udf_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = inode->i_size; if (to > isize) { truncate_pagecache(inode, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } } } static int udf_adinicb_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(folio->index != 0); memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, i_size_read(inode)); folio_unlock(folio); mark_inode_dirty(inode); return 0; } static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) return mpage_writepages(mapping, wbc, udf_get_block_wb); return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL); } static void udf_adinicb_read_folio(struct folio *folio) { struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = i_size_read(inode); folio_fill_tail(folio, 0, iinfo->i_data + iinfo->i_lenEAttr, isize); folio_mark_uptodate(folio); } static int udf_read_folio(struct file *file, struct folio *folio) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { udf_adinicb_read_folio(folio); folio_unlock(folio); return 0; } return mpage_read_folio(folio, udf_get_block); } static void udf_readahead(struct readahead_control *rac) { struct udf_inode_info *iinfo = UDF_I(rac->mapping->host); /* * No readahead needed for in-ICB files and udf_get_block() would get * confused for such file anyway. */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return; mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { ret = block_write_begin(mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); return ret; } if (WARN_ON_ONCE(pos >= PAGE_SIZE)) return -EIO; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *foliop = folio; if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); return 0; } static int udf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = file_inode(file); loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) return generic_write_end(file, mapping, pos, len, copied, folio, fsdata); last_pos = pos + copied; if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; /* Fallback to buffered IO for in-ICB files */ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return 0; ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) udf_write_failed(mapping, iocb->ki_pos + count); return ret; } static sector_t udf_bmap(struct address_space *mapping, sector_t block) { struct udf_inode_info *iinfo = UDF_I(mapping->host); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return -EINVAL; return generic_block_bmap(mapping, block, udf_get_block); } const struct address_space_operations udf_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = udf_read_folio, .readahead = udf_readahead, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = udf_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, .migrate_folio = buffer_migrate_folio, }; /* * Expand file stored in ICB to a normal one-block-file * * This function requires i_mutex held */ int udf_expand_file_adinicb(struct inode *inode) { struct folio *folio; struct udf_inode_info *iinfo = UDF_I(inode); int err; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { down_write(&iinfo->i_data_sem); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } folio = __filemap_get_folio(inode->i_mapping, 0, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_KERNEL); if (IS_ERR(folio)) return PTR_ERR(folio); if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); down_write(&iinfo->i_data_sem); memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; folio_mark_dirty(folio); folio_unlock(folio); up_write(&iinfo->i_data_sem); err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... */ folio_lock(folio); down_write(&iinfo->i_data_sem); memcpy_from_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, inode->i_size); folio_unlock(folio); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } folio_put(folio); mark_inode_dirty(inode); return err; } #define UDF_MAP_CREATE 0x01 /* Mapping can allocate new blocks */ #define UDF_MAP_NOPREALLOC 0x02 /* Do not preallocate blocks */ #define UDF_BLK_MAPPED 0x01 /* Block was successfully mapped */ #define UDF_BLK_NEW 0x02 /* Block was freshly allocated */ struct udf_map_rq { sector_t lblk; udf_pblk_t pblk; int iflags; /* UDF_MAP_ flags determining behavior */ int oflags; /* UDF_BLK_ flags reporting results */ }; static int udf_map_block(struct inode *inode, struct udf_map_rq *map) { int ret; struct udf_inode_info *iinfo = UDF_I(inode); if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)) return -EFSCORRUPTED; map->oflags = 0; if (!(map->iflags & UDF_MAP_CREATE)) { struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; int8_t etype; down_read(&iinfo->i_data_sem); ret = inode_bmap(inode, map->lblk, &epos, &eloc, &elen, &offset, &etype); if (ret < 0) goto out_read; if (ret > 0 && etype == (EXT_RECORDED_ALLOCATED >> 30)) { map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); map->oflags |= UDF_BLK_MAPPED; ret = 0; } out_read: up_read(&iinfo->i_data_sem); brelse(epos.bh); return ret; } down_write(&iinfo->i_data_sem); /* * Block beyond EOF and prealloc extents? Just discard preallocation * as it is not useful and complicates things. */ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents) udf_discard_prealloc(inode); udf_clear_extent_cache(inode); ret = inode_getblk(inode, map); up_write(&iinfo->i_data_sem); return ret; } static int __udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int flags) { int err; struct udf_map_rq map = { .lblk = block, .iflags = flags, }; err = udf_map_block(inode, &map); if (err < 0) return err; if (map.oflags & UDF_BLK_MAPPED) { map_bh(bh_result, inode->i_sb, map.pblk); if (map.oflags & UDF_BLK_NEW) set_buffer_new(bh_result); } return 0; } int udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int flags = create ? UDF_MAP_CREATE : 0; /* * We preallocate blocks only for regular files. It also makes sense * for directories but there's a problem when to drop the * preallocation. We might use some delayed work for that but I feel * it's overengineering for a filesystem like UDF. */ if (!S_ISREG(inode->i_mode)) flags |= UDF_MAP_NOPREALLOC; return __udf_get_block(inode, block, bh_result, flags); } /* * We shouldn't be allocating blocks on page writeback since we allocate them * on page fault. We can spot dirty buffers without allocated blocks though * when truncate expands file. These however don't have valid data so we can * safely ignore them. So never allocate blocks from page writeback. */ static int udf_get_block_wb(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { return __udf_get_block(inode, block, bh_result, 0); } /* Extend the file with new blocks totaling 'new_block_bytes', * return the number of extents added */ static int udf_do_extend_file(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, loff_t new_block_bytes) { uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; struct udf_inode_info *iinfo; int err; /* The previous extent is fake and we should not extend by anything * - there's nothing to do... */ if (!new_block_bytes && fake) return 0; iinfo = UDF_I(inode); /* Round the last extent up to a multiple of block size */ if (last_ext->extLength & (sb->s_blocksize - 1)) { last_ext->extLength = (last_ext->extLength & UDF_EXTENT_FLAG_MASK) | (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1)); iinfo->i_lenExtents = (iinfo->i_lenExtents + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); } add = 0; /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { add = (1 << 30) - sb->s_blocksize - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); if (add > new_block_bytes) add = new_block_bytes; new_block_bytes -= add; last_ext->extLength += add; } if (fake) { err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err < 0) goto out_err; count++; } else { struct kernel_lb_addr tmploc; uint32_t tmplen; int8_t tmptype; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); /* * We've rewritten the last extent. If we are going to add * more extents, we may need to enter possible following * empty indirect extent. */ if (new_block_bytes) { err = udf_next_aext(inode, last_pos, &tmploc, &tmplen, &tmptype, 0); if (err < 0) goto out_err; } } iinfo->i_lenExtents += add; /* Managed to do everything necessary? */ if (!new_block_bytes) goto out; /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */ last_ext->extLocation.logicalBlockNum = 0; last_ext->extLocation.partitionReferenceNum = 0; add = (1 << 30) - sb->s_blocksize; last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | add; /* Create enough extents to cover the whole hole */ while (new_block_bytes > add) { new_block_bytes -= add; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) goto out_err; iinfo->i_lenExtents += add; count++; } if (new_block_bytes) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | new_block_bytes; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) goto out_err; iinfo->i_lenExtents += new_block_bytes; count++; } out: /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) last_pos->offset -= sizeof(struct long_ad); else return -EIO; return count; out_err: /* Remove extents we've created so far */ udf_clear_extent_cache(inode); udf_truncate_extents(inode); return err; } /* Extend the final block of the file to final_block_len bytes */ static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, uint32_t new_elen) { uint32_t added_bytes; /* * Extent already large enough? It may be already rounded up to block * size... */ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) return; added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); } static int udf_extend_file(struct inode *inode, loff_t newsize) { struct extent_position epos; struct kernel_lb_addr eloc; uint32_t elen; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); down_write(&iinfo->i_data_sem); /* * When creating hole in file, just don't bother with preserving * preallocation. It likely won't be very useful anyway. */ udf_discard_prealloc(inode); err = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset, &etype); if (err < 0) goto out; within_last_ext = (err == 1); /* We don't expect extents past EOF... */ WARN_ON_ONCE(within_last_ext && elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { /* File has no extents at all or has empty last * indirect extent! Create a fake extent... */ extent.extLocation.logicalBlockNum = 0; extent.extLocation.partitionReferenceNum = 0; extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; } else { epos.offset -= adsize; err = udf_next_aext(inode, &epos, &extent.extLocation, &extent.extLength, &etype, 0); if (err <= 0) goto out; extent.extLength |= etype << 30; } new_elen = ((loff_t)offset << inode->i_blkbits) | (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ if (within_last_ext) { /* Extending file within the last file block */ udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) goto out; err = 0; out: brelse(epos.bh); up_write(&iinfo->i_data_sem); return err; } static int inode_getblk(struct inode *inode, struct udf_map_rq *map) { struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; uint32_t elen = 0, tmpelen; struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; udf_pblk_t newblocknum; sector_t offset = 0; int8_t etype, tmpetype; struct udf_inode_info *iinfo = UDF_I(inode); udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF = false; int ret = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; cur_epos = next_epos = prev_epos; b_off = (loff_t)map->lblk << inode->i_sb->s_blocksize_bits; /* find the extent which contains the block we are looking for. alternate between laarr[0] and laarr[1] for locations of the current extent, and the previous extent */ do { if (prev_epos.bh != cur_epos.bh) { brelse(prev_epos.bh); get_bh(cur_epos.bh); prev_epos.bh = cur_epos.bh; } if (cur_epos.bh != next_epos.bh) { brelse(cur_epos.bh); get_bh(next_epos.bh); cur_epos.bh = next_epos.bh; } lbcount += elen; prev_epos.block = cur_epos.block; cur_epos.block = next_epos.block; prev_epos.offset = cur_epos.offset; cur_epos.offset = next_epos.offset; ret = udf_next_aext(inode, &next_epos, &eloc, &elen, &etype, 1); if (ret < 0) { goto out_free; } else if (ret == 0) { isBeyondEOF = true; break; } c = !c; laarr[c].extLength = (etype << 30) | elen; laarr[c].extLocation = eloc; if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) pgoal = eloc.logicalBlockNum + ((elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); count++; } while (lbcount + elen <= b_off); b_off -= lbcount; offset = b_off >> inode->i_sb->s_blocksize_bits; /* * Move prev_epos and cur_epos into indirect extent if we are at * the pointer to it */ ret = udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, &tmpetype, 0); if (ret < 0) goto out_free; ret = udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, &tmpetype, 0); if (ret < 0) goto out_free; /* if the extent is allocated and recorded, return the block if the extent is not a multiple of the blocksize, round up */ if (!isBeyondEOF && etype == (EXT_RECORDED_ALLOCATED >> 30)) { if (elen & (inode->i_sb->s_blocksize - 1)) { elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); iinfo->i_lenExtents = ALIGN(iinfo->i_lenExtents, inode->i_sb->s_blocksize); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } map->oflags = UDF_BLK_MAPPED; map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); goto out_free; } /* Are we beyond EOF and preallocated extent? */ if (isBeyondEOF) { loff_t hole_len; if (count) { if (c) laarr[0] = laarr[1]; startnum = 1; } else { /* Create a fake extent when there's not one */ memset(&laarr[0].extLocation, 0x00, sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; /* Will udf_do_extend_file() create real extent from a fake one? */ startnum = (offset > 0); } /* Create extents for the hole between EOF and offset */ hole_len = (loff_t)offset << inode->i_blkbits; ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); if (ret < 0) goto out_free; c = 0; offset = 0; count += ret; /* * Is there any real extent? - otherwise we overwrite the fake * one... */ if (count) c = !c; laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | inode->i_sb->s_blocksize; memset(&laarr[c].extLocation, 0x00, sizeof(struct kernel_lb_addr)); count++; endnum = c + 1; lastblock = 1; } else { endnum = startnum = ((count > 2) ? 2 : count); /* if the current extent is in position 0, swap it with the previous */ if (!c && count != 1) { laarr[2] = laarr[0]; laarr[0] = laarr[1]; laarr[1] = laarr[2]; c = 1; } /* if the current block is located in an extent, read the next extent */ ret = udf_next_aext(inode, &next_epos, &eloc, &elen, &etype, 0); if (ret > 0) { laarr[c + 1].extLength = (etype << 30) | elen; laarr[c + 1].extLocation = eloc; count++; startnum++; endnum++; } else if (ret == 0) lastblock = 1; else goto out_free; } /* if the current extent is not recorded but allocated, get the * block in the extent corresponding to the requested block */ if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) newblocknum = laarr[c].extLocation.logicalBlockNum + offset; else { /* otherwise, allocate a new block */ if (iinfo->i_next_alloc_block == map->lblk) goal = iinfo->i_next_alloc_goal; if (!goal) { if (!(goal = pgoal)) /* XXX: what was intended here? */ goal = iinfo->i_location.logicalBlockNum + 1; } newblocknum = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, goal, &ret); if (!newblocknum) goto out_free; if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; } /* if the extent the requsted block is located in contains multiple * blocks, split the extent into at most three extents. blocks prior * to requested block, requested block, and blocks after requested * block */ udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); if (!(map->iflags & UDF_MAP_NOPREALLOC)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); /* merge any continuous blocks in laarr */ udf_merge_extents(inode, laarr, &endnum); /* write back the new extents, inserting new extents if the new number * of extents is greater than the old number, and deleting extents if * the new number of extents is less than the old number */ ret = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); if (ret < 0) goto out_free; map->pblk = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); if (!map->pblk) { ret = -EFSCORRUPTED; goto out_free; } map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; iinfo->i_next_alloc_block = map->lblk + 1; iinfo->i_next_alloc_goal = newblocknum + 1; inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); ret = 0; out_free: brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); return ret; } static void udf_split_extents(struct inode *inode, int *c, int offset, udf_pblk_t newblocknum, struct kernel_long_ad *laarr, int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) || (laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { int curr = *c; int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits; int8_t etype = (laarr[curr].extLength >> 30); if (blen == 1) ; else if (!offset || blen == offset + 1) { laarr[curr + 2] = laarr[curr + 1]; laarr[curr + 1] = laarr[curr]; } else { laarr[curr + 3] = laarr[curr + 1]; laarr[curr + 2] = laarr[curr + 1] = laarr[curr]; } if (offset) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &laarr[curr].extLocation, 0, offset); laarr[curr].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (offset << blocksize_bits); laarr[curr].extLocation.logicalBlockNum = 0; laarr[curr].extLocation. partitionReferenceNum = 0; } else laarr[curr].extLength = (etype << 30) | (offset << blocksize_bits); curr++; (*c)++; (*endnum)++; } laarr[curr].extLocation.logicalBlockNum = newblocknum; if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) laarr[curr].extLocation.partitionReferenceNum = UDF_I(inode)->i_location.partitionReferenceNum; laarr[curr].extLength = EXT_RECORDED_ALLOCATED | blocksize; curr++; if (blen != offset + 1) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) laarr[curr].extLocation.logicalBlockNum += offset + 1; laarr[curr].extLength = (etype << 30) | ((blen - (offset + 1)) << blocksize_bits); curr++; (*endnum)++; } } } static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, struct kernel_long_ad *laarr, int *endnum) { int start, length = 0, currlength = 0, i; if (*endnum >= (c + 1)) { if (!lastblock) return; else start = c; } else { if ((laarr[c + 1].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { start = c + 1; length = currlength = (((laarr[c + 1].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else start = c; } for (i = start + 1; i <= *endnum; i++) { if (i == *endnum) { if (lastblock) length += UDF_DEFAULT_PREALLOC_BLOCKS; } else if ((laarr[i].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { length += (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else break; } if (length) { int next = laarr[start].extLocation.logicalBlockNum + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); int numalloc = udf_prealloc_blocks(inode->i_sb, inode, laarr[start].extLocation.partitionReferenceNum, next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? length : UDF_DEFAULT_PREALLOC_BLOCKS) - currlength); if (numalloc) { if (start == (c + 1)) laarr[start].extLength += (numalloc << inode->i_sb->s_blocksize_bits); else { memmove(&laarr[c + 2], &laarr[c + 1], sizeof(struct long_ad) * (*endnum - (c + 1))); (*endnum)++; laarr[c + 1].extLocation.logicalBlockNum = next; laarr[c + 1].extLocation.partitionReferenceNum = laarr[c].extLocation. partitionReferenceNum; laarr[c + 1].extLength = EXT_NOT_RECORDED_ALLOCATED | (numalloc << inode->i_sb->s_blocksize_bits); start = c + 1; } for (i = start + 1; numalloc && i < *endnum; i++) { int elen = ((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (elen > numalloc) { laarr[i].extLength -= (numalloc << inode->i_sb->s_blocksize_bits); numalloc = 0; } else { numalloc -= elen; if (*endnum > (i + 1)) memmove(&laarr[i], &laarr[i + 1], sizeof(struct long_ad) * (*endnum - (i + 1))); i--; (*endnum)--; } } UDF_I(inode)->i_lenExtents += numalloc << inode->i_sb->s_blocksize_bits; } } } static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, int *endnum) { int i; unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; for (i = 0; i < (*endnum - 1); i++) { struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; if (((li->extLength >> 30) == (lip1->extLength >> 30)) && (((li->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) || ((lip1->extLocation.logicalBlockNum - li->extLocation.logicalBlockNum) == (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits)))) { if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if (((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) && ((lip1->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { lip1->extLength = (lip1->extLength - (li->extLength & UDF_EXTENT_LENGTH_MASK) + UDF_EXTENT_LENGTH_MASK) & ~(blocksize - 1); li->extLength = (li->extLength & UDF_EXTENT_FLAG_MASK) + (UDF_EXTENT_LENGTH_MASK + 1) - blocksize; } else { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if ((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; li->extLength = (li->extLength & UDF_EXTENT_LENGTH_MASK) | EXT_NOT_RECORDED_NOT_ALLOCATED; } } } static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, int startnum, int endnum, struct extent_position *epos) { int start = 0, i; struct kernel_lb_addr tmploc; uint32_t tmplen; int8_t tmpetype; int err; if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { err = udf_insert_aext(inode, *epos, laarr[i].extLocation, laarr[i].extLength); /* * If we fail here, we are likely corrupting the extent * list and leaking blocks. At least stop early to * limit the damage. */ if (err < 0) return err; err = udf_next_aext(inode, epos, &laarr[i].extLocation, &laarr[i].extLength, &tmpetype, 1); if (err < 0) return err; start++; } } for (i = start; i < endnum; i++) { err = udf_next_aext(inode, epos, &tmploc, &tmplen, &tmpetype, 0); if (err < 0) return err; udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } return 0; } struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh = NULL; struct udf_map_rq map = { .lblk = block, .iflags = UDF_MAP_NOPREALLOC | (create ? UDF_MAP_CREATE : 0), }; *err = udf_map_block(inode, &map); if (*err || !(map.oflags & UDF_BLK_MAPPED)) return NULL; bh = sb_getblk(inode->i_sb, map.pblk); if (!bh) { *err = -ENOMEM; return NULL; } if (map.oflags & UDF_BLK_NEW) { lock_buffer(bh); memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); return bh; } if (bh_read(bh, 0) >= 0) return bh; brelse(bh); *err = -EIO; return NULL; } int udf_setsize(struct inode *inode, loff_t newsize) { int err = 0; struct udf_inode_info *iinfo; unsigned int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return -EINVAL; iinfo = UDF_I(inode); if (newsize > inode->i_size) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (bsize >= (udf_file_entry_alloc_offset(inode) + newsize)) { down_write(&iinfo->i_data_sem); iinfo->i_lenAlloc = newsize; up_write(&iinfo->i_data_sem); goto set_size; } err = udf_expand_file_adinicb(inode); if (err) return err; } err = udf_extend_file(inode, newsize); if (err) return err; set_size: truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); memset(iinfo->i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = newsize; truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); goto update_time; } err = block_truncate_page(inode->i_mapping, newsize, udf_get_block); if (err) return err; truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); if (err) return err; } update_time: inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); return err; } /* * Maximum length of linked list formed by ICB hierarchy. The chosen number is * arbitrary - just that we hopefully don't limit any real use of rewritten * inode on write-once media but avoid looping for too long on corrupted media. */ #define UDF_MAX_ICB_NESTING 1024 static int udf_read_inode(struct inode *inode, bool hidden_inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct kernel_lb_addr *iloc = &iinfo->i_location; unsigned int link_count; unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; int ret = -EIO; uint32_t uid, gid; struct timespec64 ts; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { udf_debug("partition reference: %u > logical volume partitions: %u\n", iloc->partitionReferenceNum, sbi->s_partitions); return -EIO; } if (iloc->logicalBlockNum >= sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { udf_debug("block=%u, partition=%u out of range\n", iloc->logicalBlockNum, iloc->partitionReferenceNum); return -EIO; } /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: * i_sb = sb * i_no = ino * i_flags = sb->s_flags * i_state = 0 * clean_inode(): zero fills and sets * i_count = 1 * i_nlink = 1 * i_op = NULL; */ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { udf_err(inode->i_sb, "(ino %lu) failed !bh\n", inode->i_ino); return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { udf_err(inode->i_sb, "(ino %lu) failed ident=%u\n", inode->i_ino, ident); goto out; } fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength) { brelse(ibh); memcpy(&iinfo->i_location, &loc, sizeof(struct kernel_lb_addr)); if (++indirections > UDF_MAX_ICB_NESTING) { udf_err(inode->i_sb, "too many ICBs in ICB hierarchy" " (max %d supported)\n", UDF_MAX_ICB_NESTING); goto out; } brelse(bh); goto reread; } } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { udf_err(inode->i_sb, "unsupported strategy type: %u\n", le16_to_cpu(fe->icbTag.strategyType)); goto out; } if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ iinfo->i_strat4096 = 1; iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { ret = -EIO; goto out; } iinfo->i_hidden = hidden_inode; iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; iinfo->i_lenAlloc = 0; iinfo->i_next_alloc_block = 0; iinfo->i_next_alloc_goal = 0; if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct extendedFileEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct extendedFileEntry), bs - sizeof(struct extendedFileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct fileEntry), bs - sizeof(struct fileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { iinfo->i_efe = 0; iinfo->i_use = 1; iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); ret = udf_alloc_i_data(inode, bs - sizeof(struct unallocSpaceEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct unallocSpaceEntry), bs - sizeof(struct unallocSpaceEntry)); return 0; } ret = -EIO; read_lock(&sbi->s_cred_lock); uid = le32_to_cpu(fe->uid); if (uid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; else i_uid_write(inode, uid); gid = le32_to_cpu(fe->gid); if (gid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) inode->i_gid = sbi->s_gid; else i_gid_write(inode, gid); if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_fmode; else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_dmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_dmode; else inode->i_mode = udf_convert_permissions(fe); inode->i_mode &= ~sbi->s_umask; iinfo->i_extraPerms = le32_to_cpu(fe->permissions) & ~FE_MAPPED_PERMS; read_unlock(&sbi->s_cred_lock); link_count = le16_to_cpu(fe->fileLinkCount); if (!link_count) { if (!hidden_inode) { ret = -ESTALE; goto out; } link_count = 1; } set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); udf_disk_stamp_to_time(&ts, fe->accessTime); inode_set_atime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, fe->modificationTime); inode_set_mtime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, fe->attrTime); inode_set_ctime_to_ts(inode, ts); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); iinfo->i_streamdir = 0; iinfo->i_lenStreams = 0; } else { inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); udf_disk_stamp_to_time(&ts, efe->accessTime); inode_set_atime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, efe->modificationTime); inode_set_mtime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, efe->attrTime); inode_set_ctime_to_ts(inode, ts); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); /* Named streams */ iinfo->i_streamdir = (efe->streamDirectoryICB.extLength != 0); iinfo->i_locStreamdir = lelb_to_cpu(efe->streamDirectoryICB.extLocation); iinfo->i_lenStreams = le64_to_cpu(efe->objectSize); if (iinfo->i_lenStreams >= inode->i_size) iinfo->i_lenStreams -= inode->i_size; else iinfo->i_lenStreams = 0; } inode->i_generation = iinfo->i_unique; /* * Sanity check length of allocation descriptors and extended attrs to * avoid integer overflows */ if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs) goto out; /* Now do exact checks */ if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs) goto out; /* Sanity checks for files in ICB so that we don't get confused later */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { /* * For file in ICB data is stored in allocation descriptor * so sizes should match */ if (iinfo->i_lenAlloc != inode->i_size) goto out; /* File in ICB has to fit in there... */ if (inode->i_size > bs - udf_file_entry_alloc_offset(inode)) goto out; } switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; inode->i_mode |= S_IFDIR; inc_nlink(inode); break; case ICBTAG_FILE_TYPE_REALTIME: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: case ICBTAG_FILE_TYPE_VAT20: inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; inode->i_mode |= S_IFREG; break; case ICBTAG_FILE_TYPE_BLOCK: inode->i_mode |= S_IFBLK; break; case ICBTAG_FILE_TYPE_CHAR: inode->i_mode |= S_IFCHR; break; case ICBTAG_FILE_TYPE_FIFO: init_special_inode(inode, inode->i_mode | S_IFIFO, 0); break; case ICBTAG_FILE_TYPE_SOCKET: init_special_inode(inode, inode->i_mode | S_IFSOCK, 0); break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); inode->i_mode = S_IFLNK | 0777; break; case ICBTAG_FILE_TYPE_MAIN: udf_debug("METADATA FILE-----\n"); break; case ICBTAG_FILE_TYPE_MIRROR: udf_debug("METADATA MIRROR FILE-----\n"); break; case ICBTAG_FILE_TYPE_BITMAP: udf_debug("METADATA BITMAP FILE-----\n"); break; default: udf_err(inode->i_sb, "(ino %lu) failed unknown file type=%u\n", inode->i_ino, fe->icbTag.fileType); goto out; } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (dsea) { init_special_inode(inode, inode->i_mode, MKDEV(le32_to_cpu(dsea->majorDeviceIdent), le32_to_cpu(dsea->minorDeviceIdent))); /* Developer ID ??? */ } else goto out; } ret = 0; out: brelse(bh); return ret; } static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); iinfo->i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_data) return -ENOMEM; return 0; } static umode_t udf_convert_permissions(struct fileEntry *fe) { umode_t mode; uint32_t permissions; uint32_t flags; permissions = le32_to_cpu(fe->permissions); flags = le16_to_cpu(fe->icbTag.flags); mode = ((permissions) & 0007) | ((permissions >> 2) & 0070) | ((permissions >> 4) & 0700) | ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); return mode; } void udf_update_extra_perms(struct inode *inode, umode_t mode) { struct udf_inode_info *iinfo = UDF_I(inode); /* * UDF 2.01 sec. 3.3.3.3 Note 2: * In Unix, delete permission tracks write */ iinfo->i_extraPerms &= ~FE_DELETE_PERMS; if (mode & 0200) iinfo->i_extraPerms |= FE_PERM_U_DELETE; if (mode & 0020) iinfo->i_extraPerms |= FE_PERM_G_DELETE; if (mode & 0002) iinfo->i_extraPerms |= FE_PERM_O_DELETE; } int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } static int udf_sync_inode(struct inode *inode) { return udf_update_inode(inode, 1); } static void udf_adjust_time(struct udf_inode_info *iinfo, struct timespec64 time) { if (iinfo->i_crtime.tv_sec > time.tv_sec || (iinfo->i_crtime.tv_sec == time.tv_sec && iinfo->i_crtime.tv_nsec > time.tv_nsec)) iinfo->i_crtime = time; } static int udf_update_inode(struct inode *inode, int do_sync) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint64_t lb_recorded; uint32_t udfperms; uint16_t icbflags; uint16_t crclen; int err = 0; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); bh = sb_getblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); return -EIO; } lock_buffer(bh); memset(bh->b_data, 0, inode->i_sb->s_blocksize); fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (iinfo->i_use) { struct unallocSpaceEntry *use = (struct unallocSpaceEntry *)bh->b_data; use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); crclen = sizeof(struct unallocSpaceEntry); goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) fe->uid = cpu_to_le32(UDF_INVALID_ID); else fe->uid = cpu_to_le32(i_uid_read(inode)); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) fe->gid = cpu_to_le32(UDF_INVALID_ID); else fe->gid = cpu_to_le32(i_gid_read(inode)); udfperms = ((inode->i_mode & 0007)) | ((inode->i_mode & 0070) << 2) | ((inode->i_mode & 0700) << 4); udfperms |= iinfo->i_extraPerms; fe->permissions = cpu_to_le32(udfperms); if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); else { if (iinfo->i_hidden) fe->fileLinkCount = cpu_to_le16(0); else fe->fileLinkCount = cpu_to_le16(inode->i_nlink); } fe->informationLength = cpu_to_le64(inode->i_size); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct regid *eid; struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (!dsea) { dsea = (struct deviceSpec *) udf_add_extendedattr(inode, sizeof(struct deviceSpec) + sizeof(struct regid), 12, 0x3); dsea->attrType = cpu_to_le32(12); dsea->attrSubtype = 1; dsea->attrLength = cpu_to_le32( sizeof(struct deviceSpec) + sizeof(struct regid)); dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } eid = (struct regid *)dsea->impUse; memset(eid, 0, sizeof(*eid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; dsea->majorDeviceIdent = cpu_to_le32(imajor(inode)); dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) lb_recorded = 0; /* No extents => no blocks! */ else lb_recorded = (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> (blocksize_bits - 9); if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode)); udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode)); udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; fe->uniqueID = cpu_to_le64(iinfo->i_unique); fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); crclen = sizeof(struct fileEntry); } else { memcpy(bh->b_data + sizeof(struct extendedFileEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = cpu_to_le64(inode->i_size + iinfo->i_lenStreams); efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); if (iinfo->i_streamdir) { struct long_ad *icb_lad = &efe->streamDirectoryICB; icb_lad->extLocation = cpu_to_lelb(iinfo->i_locStreamdir); icb_lad->extLength = cpu_to_le32(inode->i_sb->s_blocksize); } udf_adjust_time(iinfo, inode_get_atime(inode)); udf_adjust_time(iinfo, inode_get_mtime(inode)); udf_adjust_time(iinfo, inode_get_ctime(inode)); udf_time_to_disk_stamp(&efe->accessTime, inode_get_atime(inode)); udf_time_to_disk_stamp(&efe->modificationTime, inode_get_mtime(inode)); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; efe->uniqueID = cpu_to_le64(iinfo->i_unique); efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); fe->icbTag.numEntries = cpu_to_le16(2); } else { fe->icbTag.strategyType = cpu_to_le16(4); fe->icbTag.numEntries = cpu_to_le16(1); } if (iinfo->i_use) fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; else if (S_ISLNK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK; else if (S_ISBLK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK; else if (S_ISCHR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR; else if (S_ISFIFO(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO; else if (S_ISSOCK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET; icbflags = iinfo->i_alloc_type | ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) | ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) | ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) | (le16_to_cpu(fe->icbTag.flags) & ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID | ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY)); fe->icbTag.flags = cpu_to_le16(icbflags); if (sbi->s_udfrev >= 0x0200) fe->descTag.descVersion = cpu_to_le16(3); else fe->descTag.descVersion = cpu_to_le16(2); fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); fe->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); set_buffer_uptodate(bh); unlock_buffer(bh); /* write the data blocks */ mark_buffer_dirty(bh); if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", inode->i_ino); err = -EIO; } } brelse(bh); return err; } struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, bool hidden_inode) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); int err; if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { if (UDF_I(inode)->i_hidden != hidden_inode) { iput(inode); return ERR_PTR(-EFSCORRUPTED); } return inode; } memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); err = udf_read_inode(inode, hidden_inode); if (err < 0) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; struct allocExtDesc *aed; struct extent_position nepos; struct kernel_lb_addr neloc; int ver, adsize; int err = 0; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; neloc.logicalBlockNum = block; neloc.partitionReferenceNum = epos->block.partitionReferenceNum; bh = sb_getblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); if (!bh) return -EIO; lock_buffer(bh); memset(bh->b_data, 0x00, sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); aed = (struct allocExtDesc *)(bh->b_data); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { aed->previousAllocExtLocation = cpu_to_le32(epos->block.logicalBlockNum); } aed->lengthAllocDescs = cpu_to_le32(0); if (UDF_SB(sb)->s_udfrev >= 0x0200) ver = 3; else ver = 2; udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, sizeof(struct tag)); nepos.block = neloc; nepos.offset = sizeof(struct allocExtDesc); nepos.bh = bh; /* * Do we have to copy current last extent to make space for indirect * one? */ if (epos->offset + adsize > sb->s_blocksize) { struct kernel_lb_addr cp_loc; uint32_t cp_len; int8_t cp_type; epos->offset -= adsize; err = udf_current_aext(inode, epos, &cp_loc, &cp_len, &cp_type, 0); if (err <= 0) goto err_out; cp_len |= ((uint32_t)cp_type) << 30; __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); udf_write_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } else { __udf_add_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } brelse(epos->bh); *epos = nepos; return 0; err_out: brelse(bh); return err; } /* * Append extent at the given position - should be the first free one in inode * / indirect extent. This function assumes there is enough space in the inode * or indirect extent. Use udf_add_aext() if you didn't check for this before. */ int __udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { struct udf_inode_info *iinfo = UDF_I(inode); struct allocExtDesc *aed; int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (!epos->bh) { WARN_ON(iinfo->i_lenAlloc != epos->offset - udf_file_entry_alloc_offset(inode)); } else { aed = (struct allocExtDesc *)epos->bh->b_data; WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != epos->offset - sizeof(struct allocExtDesc)); WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); if (!epos->bh) { iinfo->i_lenAlloc += adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)epos->bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(epos->bh->b_data, epos->offset + (inc ? 0 : adsize)); else udf_update_tag(epos->bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(epos->bh, inode); } return 0; } /* * Append extent at given position - should be the first free one in inode * / indirect extent. Takes care of allocating and linking indirect blocks. */ int udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; struct super_block *sb = inode->i_sb; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (epos->offset + (2 * adsize) > sb->s_blocksize) { int err; udf_pblk_t new_block; new_block = udf_new_block(sb, NULL, epos->block.partitionReferenceNum, epos->block.logicalBlockNum, &err); if (!new_block) return -ENOSPC; err = udf_setup_indirect_aext(inode, new_block, epos); if (err) return err; } return __udf_add_aext(inode, epos, eloc, elen, inc); } void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; else ptr = epos->bh->b_data + epos->offset; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = (struct short_ad *)ptr; sad->extLength = cpu_to_le32(elen); sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); adsize = sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: lad = (struct long_ad *)ptr; lad->extLength = cpu_to_le32(elen); lad->extLocation = cpu_to_lelb(*eloc); memset(lad->impUse, 0x00, sizeof(lad->impUse)); adsize = sizeof(struct long_ad); break; default: return; } if (epos->bh) { if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) { struct allocExtDesc *aed = (struct allocExtDesc *)epos->bh->b_data; udf_update_tag(epos->bh->b_data, le32_to_cpu(aed->lengthAllocDescs) + sizeof(struct allocExtDesc)); } mark_buffer_dirty_inode(epos->bh, inode); } else { mark_inode_dirty(inode); } if (inc) epos->offset += adsize; } /* * Only 1 indirect extent in a row really makes sense but allow upto 16 in case * someone does some weird stuff. */ #define UDF_MAX_INDIR_EXTS 16 /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int udf_next_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int8_t *etype, int inc) { unsigned int indirections = 0; int ret = 0; udf_pblk_t block; while (1) { ret = udf_current_aext(inode, epos, eloc, elen, etype, inc); if (ret <= 0) return ret; if (*etype != (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) return ret; if (++indirections > UDF_MAX_INDIR_EXTS) { udf_err(inode->i_sb, "too many indirect extents in inode %lu\n", inode->i_ino); return -EFSCORRUPTED; } epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); epos->bh = sb_bread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %u failed!\n", block); return -EIO; } } } /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int udf_current_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int8_t *etype, int inc) { int alen; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) { if (!epos->offset) epos->offset = udf_file_entry_alloc_offset(inode); ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; alen = udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc; } else { struct allocExtDesc *header = (struct allocExtDesc *)epos->bh->b_data; if (!epos->offset) epos->offset = sizeof(struct allocExtDesc); ptr = epos->bh->b_data + epos->offset; if (check_add_overflow(sizeof(struct allocExtDesc), le32_to_cpu(header->lengthAllocDescs), &alen)) return -1; } switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc); if (!sad) return 0; *etype = le32_to_cpu(sad->extLength) >> 30; eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); eloc->partitionReferenceNum = iinfo->i_location.partitionReferenceNum; *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; break; case ICBTAG_FLAG_AD_LONG: lad = udf_get_filelongad(ptr, alen, &epos->offset, inc); if (!lad) return 0; *etype = le32_to_cpu(lad->extLength) >> 30; *eloc = lelb_to_cpu(lad->extLocation); *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: udf_debug("alloc_type = %u unsupported\n", iinfo->i_alloc_type); return -EINVAL; } return 1; } static int udf_insert_aext(struct inode *inode, struct extent_position epos, struct kernel_lb_addr neloc, uint32_t nelen) { struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; int ret; if (epos.bh) get_bh(epos.bh); while (1) { ret = udf_next_aext(inode, &epos, &oeloc, &oelen, &etype, 0); if (ret <= 0) break; udf_write_aext(inode, &epos, &neloc, nelen, 1); neloc = oeloc; nelen = (etype << 30) | oelen; } if (ret == 0) ret = udf_add_aext(inode, &epos, &neloc, nelen, 1); brelse(epos.bh); return ret; } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) { struct extent_position oepos; int adsize; int8_t etype; struct allocExtDesc *aed; struct udf_inode_info *iinfo; struct kernel_lb_addr eloc; uint32_t elen; int ret; if (epos.bh) { get_bh(epos.bh); get_bh(epos.bh); } iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else adsize = 0; oepos = epos; if (udf_next_aext(inode, &epos, &eloc, &elen, &etype, 1) <= 0) return -1; while (1) { ret = udf_next_aext(inode, &epos, &eloc, &elen, &etype, 1); if (ret < 0) { brelse(epos.bh); brelse(oepos.bh); return -1; } if (ret == 0) break; udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); if (oepos.bh != epos.bh) { oepos.block = epos.block; brelse(oepos.bh); get_bh(epos.bh); oepos.bh = epos.bh; oepos.offset = epos.offset - adsize; } } memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); elen = 0; if (epos.bh != oepos.bh) { udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= (adsize * 2); mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, oepos.offset - (2 * adsize)); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } else { udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, epos.offset - adsize); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } brelse(epos.bh); brelse(oepos.bh); return (elen >> 30); } /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int inode_bmap(struct inode *inode, sector_t block, struct extent_position *pos, struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset, int8_t *etype) { unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; loff_t lbcount = 0, bcount = (loff_t) block << blocksize_bits; struct udf_inode_info *iinfo; int err = 0; iinfo = UDF_I(inode); if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { pos->offset = 0; pos->block = iinfo->i_location; pos->bh = NULL; } *elen = 0; do { err = udf_next_aext(inode, pos, eloc, elen, etype, 1); if (err <= 0) { if (err == 0) { *offset = (bcount - lbcount) >> blocksize_bits; iinfo->i_lenExtents = lbcount; } return err; } lbcount += *elen; } while (lbcount <= bcount); /* update extent cache */ udf_update_extent_cache(inode, lbcount - *elen, pos); *offset = (bcount + *elen - lbcount) >> blocksize_bits; return 1; }
89 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/tcp.h> #include <linux/vmalloc.h> #include <net/request_sock.h> /* * Maximum number of SYN_RECV sockets in queue per LISTEN socket. * One SYN_RECV socket costs about 80bytes on a 32bit machine. * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. * * The minimum value of it is 128. Experiments with real servers show that * it is absolutely not enough even at 100conn/sec. 256 cures most * of problems. * This value is adjusted to 128 for low memory machines, * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ void reqsk_queue_alloc(struct request_sock_queue *queue) { queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; queue->rskq_accept_head = NULL; } /* * This function is called to set a Fast Open socket's "fastopen_rsk" field * to NULL when a TFO socket no longer needs to access the request_sock. * This happens only after 3WHS has been either completed or aborted (e.g., * RST is received). * * Before TFO, a child socket is created only after 3WHS is completed, * hence it never needs to access the request_sock. things get a lot more * complex with TFO. A child socket, accepted or not, has to access its * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, * until 3WHS is either completed or aborted. Afterwards the req will stay * until either the child socket is accepted, or in the rare case when the * listener is closed before the child is accepted. * * In short, a request socket is only freed after BOTH 3WHS has completed * (or aborted) and the child socket has been accepted (or listener closed). * When a child socket is accepted, its corresponding req->sk is set to * NULL since it's no longer needed. More importantly, "req->sk == NULL" * will be used by the code below to determine if a child socket has been * accepted or not, and the check is protected by the fastopenq->lock * described below. * * Note that fastopen_rsk is only accessed from the child socket's context * with its socket lock held. But a request_sock (req) can be accessed by * both its child socket through fastopen_rsk, and a listener socket through * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. * only in the rare case when both the listener and the child locks are held, * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. * The lock also protects other fields such as fastopenq->qlen, which is * decremented by this function when fastopen_rsk is no longer needed. * * Note that another solution was to simply use the existing socket lock * from the listener. But first socket lock is difficult to use. It is not * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to * acquire a child's lock while holding listener's socket lock. A corner * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; if (!reset || lsk->sk_state != TCP_LISTEN) { /* If the listener has been closed don't bother with the * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. * This is a simple defense against TFO spoofing attack - by * counting the req against fastopen.max_qlen, and disabling * TFO when the qlen exceeds max_qlen. * * For more details see CoNext'11 "TCP Fast Open" paper. */ req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else fastopenq->rskq_rst_tail->dl_next = req; req->dl_next = NULL; fastopenq->rskq_rst_tail = req; fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); }
7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_INCLUDE_PATH ../../drivers/dma-buf #define TRACE_SYSTEM sync_trace #if !defined(_TRACE_SYNC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SYNC_H #include "sync_debug.h" #include <linux/tracepoint.h> TRACE_EVENT(sync_timeline, TP_PROTO(struct sync_timeline *timeline), TP_ARGS(timeline), TP_STRUCT__entry( __string(name, timeline->name) __field(u32, value) ), TP_fast_assign( __assign_str(name); __entry->value = timeline->value; ), TP_printk("name=%s value=%d", __get_str(name), __entry->value) ); #endif /* if !defined(_TRACE_SYNC_H) || defined(TRACE_HEADER_MULTI_READ) */ /* This part must be outside protection */ #include <trace/define_trace.h>
2 2 2 7 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 /* SPDX-License-Identifier: GPL-2.0 */ /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. */ #ifndef _LINUX_XSK_QUEUE_H #define _LINUX_XSK_QUEUE_H #include <linux/types.h> #include <linux/if_xdp.h> #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #include "xsk.h" struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; u32 pad2 ____cacheline_aligned_in_smp; u32 flags; u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ struct xdp_rxtx_ring { struct xdp_ring ptrs; struct xdp_desc desc[] ____cacheline_aligned_in_smp; }; /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; u64 desc[] ____cacheline_aligned_in_smp; }; struct xsk_queue { u32 ring_mask; u32 nentries; u32 cached_prod; u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; u64 queue_empty_descs; size_t ring_vmalloc_size; }; struct parsed_desc { u32 mb; u32 valid; }; /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and * completion ring, the kernel is the producer and user space is the * consumer. For the Tx and fill rings, the kernel is the consumer and * user space is the producer. * * producer consumer * * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) * STORE $data LOAD $data * STORE.rel ->producer (B) STORE.rel ->consumer (D) * } * * (A) pairs with (D), and (B) pairs with (C). * * Starting with (B), it protects the data from being written after * the producer pointer. If this barrier was missing, the consumer * could observe the producer pointer being set and thus load the data * before the producer has written the new data. The consumer would in * this case load the old data. * * (C) protects the consumer from speculatively loading the data before * the producer pointer actually has been read. If we do not have this * barrier, some architectures could load old data as speculative loads * are not discarded as the CPU does not know there is a dependency * between ->producer and data. * * (A) is a control dependency that separates the load of ->consumer * from the stores of $data. In case ->consumer indicates there is no * room in the buffer to store $data we do not. The dependency will * order both of the stores after the loads. So no barrier is needed. * * (D) protects the load of the data to be observed to happen after the * store of the consumer pointer. If we did not have this memory * barrier, the producer could observe the consumer pointer being set * and overwrite the data with a new value before the consumer got the * chance to read the old value. The consumer would thus miss reading * the old entry and very likely read the new entry twice, once right * now and again after circling through the ring. */ /* The operations on the rings are the following: * * producer consumer * * RESERVE entries PEEK in the ring for entries * WRITE data into the ring READ data from the ring * SUBMIT entries RELEASE entries * * The producer reserves one or more entries in the ring. It can then * fill in these entries and finally submit them so that they can be * seen and read by the consumer. * * The consumer peeks into the ring to see if the producer has written * any new entries. If so, the consumer can then read these entries * and when it is done reading them release them back to the producer * so that the producer can use these slots to fill in new entries. * * The function names below reflect these operations. */ /* Functions that read and validate content from consumer rings. */ static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; *addr = ring->desc[idx]; } static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_cons != q->cached_prod) { __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); return true; } return false; } static inline bool xp_unused_options_set(u32 options) { return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA); } static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = desc->addr - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; u64 offset = addr & (pool->chunk_size - 1); if (!desc->len) return false; if (offset + len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; if (!desc->len) return false; if (len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : xp_aligned_validate_desc(pool, desc); } static inline bool xskq_has_descs(struct xsk_queue *q) { return q->cached_cons != q->cached_prod; } static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) { if (!xp_validate_desc(pool, d)) { q->invalid_descs++; return false; } return true; } static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; return xskq_cons_is_valid_desc(q, desc, pool); } q->queue_empty_descs++; return false; } static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) { q->cached_cons += cnt; } static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, struct xdp_desc *desc, struct parsed_desc *parsed) { parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); parsed->mb = xp_mb_desc(desc); } static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; u32 total_descs = 0, nr_frags = 0; /* track first entry, if stumble upon *any* invalid descriptor, rewind * current packet that consists of frags and stop the processing */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; cached_cons++; parse_desc(q, pool, &descs[nb_entries], &parsed); if (unlikely(!parsed.valid)) break; if (likely(!parsed.mb)) { total_descs += (nr_frags + 1); nr_frags = 0; } else { nr_frags++; if (nr_frags == pool->xdp_zc_max_segs) { nr_frags = 0; break; } } nb_entries++; } cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); return total_descs; } /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) { smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */ } static inline void __xskq_cons_peek(struct xsk_queue *q) { /* Refresh the local pointer */ q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */ } static inline void xskq_cons_get_entries(struct xsk_queue *q) { __xskq_cons_release(q); __xskq_cons_peek(q); } static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; if (entries >= max) return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; return entries >= max ? max : entries; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_addr_unchecked(q, addr); } static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_desc(q, desc, pool); } /* To improve performance in the xskq_cons_release functions, only update local state here. * Reflect this to global state when we get new entries from the ring in * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. */ static inline void xskq_cons_release(struct xsk_queue *q) { q->cached_cons++; } static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_cons -= cnt; } static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer); } /* Functions for producers */ static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); if (free_entries >= max) return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); return free_entries >= max ? max : free_entries; } static inline bool xskq_prod_is_full(struct xsk_queue *q) { return xskq_prod_nb_free(q, 1) ? false : true; } static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ q->cached_prod++; return 0; } static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ ring->desc[q->cached_prod++ & q->ring_mask] = addr; return 0; } static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 i, cached_prod; /* A, matches D */ cached_prod = q->cached_prod; for (i = 0; i < nb_entries; i++) ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; q->cached_prod = cached_prod; } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; if (xskq_prod_is_full(q)) return -ENOBUFS; /* A, matches D */ idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; ring->desc[idx].options = flags; return 0; } static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { smp_store_release(&q->ring->producer, idx); /* B, matches C */ } static inline void xskq_prod_submit(struct xsk_queue *q) { __xskq_prod_submit(q, q->cached_prod); } static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); } static inline bool xskq_prod_is_empty(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } /* For both producers and consumers */ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) { return q ? q->invalid_descs : 0; } static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) { return q ? q->queue_empty_descs : 0; } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */
19 1 5 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/file.c * * Copyright (C) 1991, 1992 Linus Torvalds * * minix regular file handling primitives */ #include "minix.h" /* * We have mostly NULLs here: the current defaults are OK for * the minix filesystem. */ const struct file_operations minix_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, .splice_read = filemap_splice_read, }; static int minix_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; truncate_setsize(inode, attr->ia_size); minix_truncate(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } const struct inode_operations minix_file_inode_operations = { .setattr = minix_setattr, .getattr = minix_getattr, };
1 466 6 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ #include "bat_v.h" #include "main.h" #include <linux/atomic.h> #include <linux/cache.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if) { batadv_v_elp_iface_activate(primary_if, hard_iface); batadv_hardif_put(primary_if); } /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can * set the interface as ACTIVE right away, without any risk of race * condition */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; } static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) { int ret; ret = batadv_v_elp_iface_enable(hard_iface); if (ret < 0) return ret; ret = batadv_v_ogm_iface_enable(hard_iface); if (ret < 0) batadv_v_elp_iface_disable(hard_iface); return ret; } static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) { batadv_v_ogm_iface_disable(hard_iface); batadv_v_elp_iface_disable(hard_iface); } static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) { batadv_v_elp_primary_iface_set(hard_iface); batadv_v_ogm_primary_iface_set(hard_iface); } /** * batadv_v_iface_update_mac() - react to hard-interface MAC address change * @hard_iface: the modified interface * * If the modified interface is the primary one, update the originator * address in the ELP and OGM messages to reflect the new MAC address. */ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if != hard_iface) goto out; batadv_v_primary_iface_set(hard_iface); out: batadv_hardif_put(primary_if); } static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); } /** * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to dump * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; u32 throughput; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); throughput = throughput * 100; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_neigh_dump_hardif() - Dump the neighbours of a hard interface into * a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @hard_iface: The hard interface to be dumped * @idx_s: Entries to be skipped * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_v_neigh_dump() - Dump the neighbours of a hard interface into a * message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @single_hardif: Limit dumping to this hard interface */ static void batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (i_hardif++ < i_hardif_s) continue; if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { struct batadv_neigh_ifinfo *n_ifinfo; unsigned int last_seen_msecs; u32 throughput; void *hdr; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return 0; throughput = n_ifinfo->bat_v.throughput * 100; batadv_neigh_ifinfo_put(n_ifinfo); last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_v_orig_dump_bucket() - Dump an originator bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_v_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_v_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; int ret = 0; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; u32 threshold; bool ret = false; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; threshold = ifinfo1->bat_v.throughput / 4; threshold = ifinfo1->bat_v.throughput - threshold; ret = ifinfo2->bat_v.throughput > threshold; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } /** * batadv_v_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) { /* set default throughput difference threshold to 5Mbps */ atomic_set(&bat_priv->gw.sel_class, 50); } /** * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for * @bw: the pointer where the metric will be stored. The metric is computed as * the minimum between the GW advertised throughput and the path throughput to * it in the mesh * * Return: 0 on success, -1 on failure */ static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_orig_node *orig_node; struct batadv_neigh_node *router; int ret = -1; orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; /* the GW metric is computed as the minimum between the path throughput * to reach the GW itself and the advertised bandwidth. * This gives us an approximation of the effective throughput that the * client can expect via this particular GW node */ *bw = router_ifinfo->bat_v.throughput; *bw = min_t(u32, *bw, gw_node->bandwidth_down); ret = 0; out: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); return ret; } /** * batadv_v_gw_get_best_gw_node() - retrieve the best GW node * @bat_priv: the bat priv with all the soft interface information * * Return: the GW node having the best GW-metric, NULL if no GW is known */ static struct batadv_gw_node * batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node, *curr_gw = NULL; u32 max_bw = 0, bw; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { if (!kref_get_unless_zero(&gw_node->refcount)) continue; if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) goto next; if (curr_gw && bw <= max_bw) goto next; batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); max_bw = bw; next: batadv_gw_node_put(gw_node); } rcu_read_unlock(); return curr_gw; } /** * batadv_v_gw_is_eligible() - check if a originator would be selected as GW * @bat_priv: the bat priv with all the soft interface information * @curr_gw_orig: originator representing the currently selected GW * @orig_node: the originator representing the new candidate * * Return: true if orig_node can be selected as current GW, false otherwise */ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_gw_node *curr_gw, *orig_gw = NULL; u32 gw_throughput, orig_throughput, threshold; bool ret = false; threshold = atomic_read(&bat_priv->gw.sel_class); curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig); if (!curr_gw) { ret = true; goto out; } if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) { ret = true; goto out; } orig_gw = batadv_gw_node_get(bat_priv, orig_node); if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) goto out; if (orig_throughput < gw_throughput) goto out; if ((orig_throughput - gw_throughput) < threshold) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n", gw_throughput, orig_throughput); ret = true; out: batadv_gw_node_put(curr_gw); batadv_gw_node_put(orig_gw); return ret; } /** * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) { if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, router_ifinfo->bat_v.throughput)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_v_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information */ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", .iface = { .activate = batadv_v_iface_activate, .enable = batadv_v_iface_enable, .disable = batadv_v_iface_disable, .update_mac = batadv_v_iface_update_mac, .primary_set = batadv_v_primary_iface_set, }, .neigh = { .hardif_init = batadv_v_hardif_neigh_init, .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, .dump = batadv_v_neigh_dump, }, .orig = { .dump = batadv_v_orig_dump, }, .gw = { .init_sel_class = batadv_v_init_sel_class, .sel_class_max = U32_MAX, .get_best_gw_node = batadv_v_gw_get_best_gw_node, .is_eligible = batadv_v_gw_is_eligible, .dump = batadv_v_gw_dump, }, }; /** * batadv_v_hardif_init() - initialize the algorithm specific fields in the * hard-interface object * @hard_iface: the hard-interface to initialize */ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) { /* enable link throughput auto-detection by setting the throughput * override to zero */ atomic_set(&hard_iface->bat_v.throughput_override, 0); atomic_set(&hard_iface->bat_v.elp_interval, 500); hard_iface->bat_v.aggr_len = 0; skb_queue_head_init(&hard_iface->bat_v.aggr_list); INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, batadv_v_ogm_aggr_work); } /** * batadv_v_mesh_init() - initialize the B.A.T.M.A.N. V private resources for a * mesh * @bat_priv: the object representing the mesh interface to initialise * * Return: 0 on success or a negative error code otherwise */ int batadv_v_mesh_init(struct batadv_priv *bat_priv) { int ret = 0; ret = batadv_v_ogm_init(bat_priv); if (ret < 0) return ret; return 0; } /** * batadv_v_mesh_free() - free the B.A.T.M.A.N. V private resources for a mesh * @bat_priv: the object representing the mesh interface to free */ void batadv_v_mesh_free(struct batadv_priv *bat_priv) { batadv_v_ogm_free(bat_priv); } /** * batadv_v_init() - B.A.T.M.A.N. V initialization function * * Description: Takes care of initializing all the subcomponents. * It is invoked upon module load only. * * Return: 0 on success or a negative error code otherwise */ int __init batadv_v_init(void) { int ret; /* B.A.T.M.A.N. V echo location protocol packet */ ret = batadv_recv_handler_register(BATADV_ELP, batadv_v_elp_packet_recv); if (ret < 0) return ret; ret = batadv_recv_handler_register(BATADV_OGM2, batadv_v_ogm_packet_recv); if (ret < 0) goto elp_unregister; ret = batadv_algo_register(&batadv_batman_v); if (ret < 0) goto ogm_unregister; return ret; ogm_unregister: batadv_recv_handler_unregister(BATADV_OGM2); elp_unregister: batadv_recv_handler_unregister(BATADV_ELP); return ret; }
87 87 19 14 5 19 47 41 31 15 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 // SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/generic.c - generic driver for USB devices (not interfaces) * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * based on drivers/usb/usb.c which had the following copyrights: * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2004 * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. */ #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/string_choices.h> #include <uapi/linux/usb/audio.h> #include "usb.h" static int is_rndis(struct usb_interface_descriptor *desc) { return desc->bInterfaceClass == USB_CLASS_COMM && desc->bInterfaceSubClass == 2 && desc->bInterfaceProtocol == 0xff; } static int is_activesync(struct usb_interface_descriptor *desc) { return desc->bInterfaceClass == USB_CLASS_MISC && desc->bInterfaceSubClass == 1 && desc->bInterfaceProtocol == 1; } static bool is_audio(struct usb_interface_descriptor *desc) { return desc->bInterfaceClass == USB_CLASS_AUDIO; } static bool is_uac3_config(struct usb_interface_descriptor *desc) { return desc->bInterfaceProtocol == UAC_VERSION_3; } int usb_choose_configuration(struct usb_device *udev) { int i; int num_configs; int insufficient_power = 0; struct usb_host_config *c, *best; struct usb_device_driver *udriver; /* * If a USB device (not an interface) doesn't have a driver then the * kernel has no business trying to select or install a configuration * for it. */ if (!udev->dev.driver) return -1; udriver = to_usb_device_driver(udev->dev.driver); if (usb_device_is_owned(udev)) return 0; if (udriver->choose_configuration) { i = udriver->choose_configuration(udev); if (i >= 0) return i; } best = NULL; c = udev->config; num_configs = udev->descriptor.bNumConfigurations; for (i = 0; i < num_configs; (i++, c++)) { struct usb_interface_descriptor *desc = NULL; /* It's possible that a config has no interfaces! */ if (c->desc.bNumInterfaces > 0) desc = &c->intf_cache[0]->altsetting->desc; /* * HP's USB bus-powered keyboard has only one configuration * and it claims to be self-powered; other devices may have * similar errors in their descriptors. If the next test * were allowed to execute, such configurations would always * be rejected and the devices would not work as expected. * In the meantime, we run the risk of selecting a config * that requires external power at a time when that power * isn't available. It seems to be the lesser of two evils. * * Bugzilla #6448 reports a device that appears to crash * when it receives a GET_DEVICE_STATUS request! We don't * have any other way to tell whether a device is self-powered, * but since we don't use that information anywhere but here, * the call has been removed. * * Maybe the GET_DEVICE_STATUS call and the test below can * be reinstated when device firmwares become more reliable. * Don't hold your breath. */ #if 0 /* Rule out self-powered configs for a bus-powered device */ if (bus_powered && (c->desc.bmAttributes & USB_CONFIG_ATT_SELFPOWER)) continue; #endif /* * The next test may not be as effective as it should be. * Some hubs have errors in their descriptor, claiming * to be self-powered when they are really bus-powered. * We will overestimate the amount of current such hubs * make available for each port. * * This is a fairly benign sort of failure. It won't * cause us to reject configurations that we should have * accepted. */ /* Rule out configs that draw too much bus current */ if (usb_get_max_power(udev, c) > udev->bus_mA) { insufficient_power++; continue; } /* * Select first configuration as default for audio so that * devices that don't comply with UAC3 protocol are supported. * But, still iterate through other configurations and * select UAC3 compliant config if present. */ if (desc && is_audio(desc)) { /* Always prefer the first found UAC3 config */ if (is_uac3_config(desc)) { best = c; break; } /* If there is no UAC3 config, prefer the first config */ else if (i == 0) best = c; /* Unconditional continue, because the rest of the code * in the loop is irrelevant for audio devices, and * because it can reassign best, which for audio devices * we don't want. */ continue; } /* When the first config's first interface is one of Microsoft's * pet nonstandard Ethernet-over-USB protocols, ignore it unless * this kernel has enabled the necessary host side driver. * But: Don't ignore it if it's the only config. */ if (i == 0 && num_configs > 1 && desc && (is_rndis(desc) || is_activesync(desc))) { #if !defined(CONFIG_USB_NET_RNDIS_HOST) && !defined(CONFIG_USB_NET_RNDIS_HOST_MODULE) continue; #else best = c; #endif } /* From the remaining configs, choose the first one whose * first interface is for a non-vendor-specific class. * Reason: Linux is more likely to have a class driver * than a vendor-specific driver. */ else if (udev->descriptor.bDeviceClass != USB_CLASS_VENDOR_SPEC && (desc && desc->bInterfaceClass != USB_CLASS_VENDOR_SPEC)) { best = c; break; } /* If all the remaining configs are vendor-specific, * choose the first one. */ else if (!best) best = c; } if (insufficient_power > 0) dev_info(&udev->dev, "rejected %d configuration%s " "due to insufficient available bus power\n", insufficient_power, str_plural(insufficient_power)); if (best) { i = best->desc.bConfigurationValue; dev_dbg(&udev->dev, "configuration #%d chosen from %d choice%s\n", i, num_configs, str_plural(num_configs)); } else { i = -1; dev_warn(&udev->dev, "no configuration chosen from %d choice%s\n", num_configs, str_plural(num_configs)); } return i; } EXPORT_SYMBOL_GPL(usb_choose_configuration); static int __check_for_non_generic_match(struct device_driver *drv, void *data) { struct usb_device *udev = data; struct usb_device_driver *udrv; if (!is_usb_device_driver(drv)) return 0; udrv = to_usb_device_driver(drv); if (udrv == &usb_generic_driver) return 0; return usb_driver_applicable(udev, udrv); } static bool usb_generic_driver_match(struct usb_device *udev) { if (udev->use_generic_driver) return true; /* * If any other driver wants the device, leave the device to this other * driver. */ if (bus_for_each_drv(&usb_bus_type, NULL, udev, __check_for_non_generic_match)) return false; return true; } int usb_generic_driver_probe(struct usb_device *udev) { int err, c; /* Choose and set the configuration. This registers the interfaces * with the driver core and lets interface drivers bind to them. */ if (udev->authorized == 0) dev_err(&udev->dev, "Device is not authorized for usage\n"); else { c = usb_choose_configuration(udev); if (c >= 0) { err = usb_set_configuration(udev, c); if (err && err != -ENODEV) { dev_err(&udev->dev, "can't set config #%d, error %d\n", c, err); /* This need not be fatal. The user can try to * set other configurations. */ } } } /* USB device state == configured ... usable */ usb_notify_add_device(udev); return 0; } void usb_generic_driver_disconnect(struct usb_device *udev) { usb_notify_remove_device(udev); /* if this is only an unbind, not a physical disconnect, then * unconfigure the device */ if (udev->actconfig) usb_set_configuration(udev, -1); } #ifdef CONFIG_PM int usb_generic_driver_suspend(struct usb_device *udev, pm_message_t msg) { int rc; /* Normal USB devices suspend through their upstream port. * Root hubs don't have upstream ports to suspend, * so we have to shut down their downstream HC-to-USB * interfaces manually by doing a bus (or "global") suspend. */ if (!udev->parent) rc = hcd_bus_suspend(udev, msg); /* * Non-root USB2 devices don't need to do anything for FREEZE * or PRETHAW. USB3 devices don't support global suspend and * needs to be selectively suspended. */ else if ((msg.event == PM_EVENT_FREEZE || msg.event == PM_EVENT_PRETHAW) && (udev->speed < USB_SPEED_SUPER)) rc = 0; else rc = usb_port_suspend(udev, msg); if (rc == 0) usbfs_notify_suspend(udev); return rc; } int usb_generic_driver_resume(struct usb_device *udev, pm_message_t msg) { int rc; /* Normal USB devices resume/reset through their upstream port. * Root hubs don't have upstream ports to resume or reset, * so we have to start up their downstream HC-to-USB * interfaces manually by doing a bus (or "global") resume. */ if (!udev->parent) rc = hcd_bus_resume(udev, msg); else rc = usb_port_resume(udev, msg); if (rc == 0) usbfs_notify_resume(udev); return rc; } #endif /* CONFIG_PM */ struct usb_device_driver usb_generic_driver = { .name = "usb", .match = usb_generic_driver_match, .probe = usb_generic_driver_probe, .disconnect = usb_generic_driver_disconnect, #ifdef CONFIG_PM .suspend = usb_generic_driver_suspend, .resume = usb_generic_driver_resume, #endif .supports_autosuspend = 1, };
12 12 4 4 4 12 5 357 345 12 7 5 6 369 367 13 19 18 1 14 1 2 2 2 2 2 118 113 5 4 4 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "eytzinger.h" #include "journal.h" #include "journal_seq_blacklist.h" #include "super-io.h" /* * journal_seq_blacklist machinery: * * To guarantee order of btree updates after a crash, we need to detect when a * btree node entry (bset) is newer than the newest journal entry that was * successfully written, and ignore it - effectively ignoring any btree updates * that didn't make it into the journal. * * If we didn't do this, we might have two btree nodes, a and b, both with * updates that weren't written to the journal yet: if b was updated after a, * but b was flushed and not a - oops; on recovery we'll find that the updates * to b happened, but not the updates to a that happened before it. * * Ignoring bsets that are newer than the newest journal entry is always safe, * because everything they contain will also have been journalled - and must * still be present in the journal on disk until a journal entry has been * written _after_ that bset was written. * * To accomplish this, bsets record the newest journal sequence number they * contain updates for; then, on startup, the btree code queries the journal * code to ask "Is this sequence number newer than the newest journal entry? If * so, ignore it." * * When this happens, we must blacklist that journal sequence number: the * journal must not write any entries with that sequence number, and it must * record that it was blacklisted so that a) on recovery we don't think we have * missing journal entries and b) so that the btree code continues to ignore * that bset, until that btree node is rewritten. */ static unsigned sb_blacklist_u64s(unsigned nr) { struct bch_sb_field_journal_seq_blacklist *bl; return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); } int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; unsigned i = 0, nr; int ret = 0; mutex_lock(&c->sb_lock); bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); while (i < nr) { struct journal_seq_blacklist_entry *e = bl->start + i; if (end < le64_to_cpu(e->start)) break; if (start > le64_to_cpu(e->end)) { i++; continue; } /* * Entry is contiguous or overlapping with new entry: merge it * with new entry, and delete: */ start = min(start, le64_to_cpu(e->start)); end = max(end, le64_to_cpu(e->end)); array_remove_item(bl->start, nr, i); } bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, sb_blacklist_u64s(nr + 1)); if (!bl) { ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; goto out; } array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { .start = cpu_to_le64(start), .end = cpu_to_le64(end), })); c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); out: mutex_unlock(&c->sb_lock); return ret ?: bch2_blacklist_table_initialize(c); } static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) { const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; return cmp_int(l->start, r->start); } bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, bool dirty) { struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; struct journal_seq_blacklist_table_entry search = { .start = seq }; int idx; if (!t) return false; idx = eytzinger0_find_le(t->entries, t->nr, sizeof(t->entries[0]), journal_seq_blacklist_table_cmp, &search); if (idx < 0) return false; BUG_ON(t->entries[idx].start > seq); if (seq >= t->entries[idx].end) return false; if (dirty) t->entries[idx].dirty = true; return true; } int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); struct journal_seq_blacklist_table *t; unsigned i, nr = blacklist_nr_entries(bl); if (!bl) return 0; t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) return -BCH_ERR_ENOMEM_blacklist_table_init; t->nr = nr; for (i = 0; i < nr; i++) { t->entries[i].start = le64_to_cpu(bl->start[i].start); t->entries[i].end = le64_to_cpu(bl->start[i].end); } eytzinger0_sort(t->entries, t->nr, sizeof(t->entries[0]), journal_seq_blacklist_table_cmp, NULL); kfree(c->journal_seq_blacklist_table); c->journal_seq_blacklist_table = t; return 0; } static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); unsigned i, nr = blacklist_nr_entries(bl); for (i = 0; i < nr; i++) { struct journal_seq_blacklist_entry *e = bl->start + i; if (le64_to_cpu(e->start) >= le64_to_cpu(e->end)) { prt_printf(err, "entry %u start >= end (%llu >= %llu)", i, le64_to_cpu(e->start), le64_to_cpu(e->end)); return -BCH_ERR_invalid_sb_journal_seq_blacklist; } if (i + 1 < nr && le64_to_cpu(e[0].end) > le64_to_cpu(e[1].start)) { prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); return -BCH_ERR_invalid_sb_journal_seq_blacklist; } } return 0; } static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); struct journal_seq_blacklist_entry *i; unsigned nr = blacklist_nr_entries(bl); for (i = bl->start; i < bl->start + nr; i++) { if (i != bl->start) prt_printf(out, " "); prt_printf(out, "%llu-%llu", le64_to_cpu(i->start), le64_to_cpu(i->end)); } prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { .validate = bch2_sb_journal_seq_blacklist_validate, .to_text = bch2_sb_journal_seq_blacklist_to_text }; bool bch2_blacklist_entries_gc(struct bch_fs *c) { struct journal_seq_blacklist_entry *src, *dst; struct bch_sb_field_journal_seq_blacklist *bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); if (!bl) return false; unsigned nr = blacklist_nr_entries(bl); dst = bl->start; struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); unsigned i; for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); src < bl->start + nr; src++, i = eytzinger0_next(i, nr)) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; } unsigned new_nr = dst - bl->start; if (new_nr == nr) return false; bch_verbose(c, "nr blacklist entries was %u, now %u", nr, new_nr); bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, new_nr ? sb_blacklist_u64s(new_nr) : 0); BUG_ON(new_nr && !bl); return true; }
19 21 21 1 3 21 19 19 22 22 21 21 19 19 18 19 3 3 18 19 19 19 19 3 3 19 19 19 19 19 19 19 19 19 19 19 3 3 19 19 3 3 3 19 19 19 20 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/uverbs_ioctl.h> #include "rxe.h" #include "rxe_queue.h" #include "rxe_hw_counters.h" static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr); /* dev */ static int rxe_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (udata->inlen || udata->outlen) { rxe_dbg_dev(rxe, "malformed udata\n"); err = -EINVAL; goto err_out; } memcpy(attr, &rxe->attr, sizeof(*attr)); return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); struct net_device *ndev; int err, ret; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } ndev = rxe_ib_device_get_netdev(ibdev); if (!ndev) { err = -ENODEV; goto err_out; } memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, &attr->active_width); attr->state = ib_get_curr_port_state(ndev); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); dev_put(ndev); return ret; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (index != 0) { err = -EINVAL; rxe_dbg_dev(rxe, "bad pkey index = %d\n", index); goto err_out; } *pkey = IB_DEFAULT_PKEY_FULL; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | IB_DEVICE_MODIFY_NODE_DESC)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); if (mask & IB_DEVICE_MODIFY_NODE_DESC) { memcpy(rxe->ib_dev.node_desc, attr->node_desc, sizeof(rxe->ib_dev.node_desc)); } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_port(struct ib_device *ibdev, u32 port_num, int mask, struct ib_port_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); struct rxe_port *port; int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } //TODO is shutdown useful if (mask & ~(IB_PORT_RESET_QKEY_CNTR)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } port = &rxe->port; port->attr.port_cap_flags |= attr->set_port_cap_mask; port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; if (mask & IB_PORT_RESET_QKEY_CNTR) port->attr.qkey_viol_cntr = 0; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static enum rdma_link_layer rxe_get_link_layer(struct ib_device *ibdev, u32 port_num) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } return IB_LINK_LAYER_ETHERNET; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { struct rxe_dev *rxe = to_rdev(ibdev); struct ib_port_attr attr = {}; int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } err = ib_query_port(ibdev, port_num, &attr); if (err) goto err_out; immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } /* uc */ static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibuc->device); struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_add_to_pool(&rxe->uc_pool, uc); if (err) rxe_err_dev(rxe, "unable to create uc\n"); return err; } static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_cleanup(uc); if (err) rxe_err_uc(uc, "cleanup failed, err = %d\n", err); } /* pd */ static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_add_to_pool(&rxe->pd_pool, pd); if (err) { rxe_dbg_dev(rxe, "unable to alloc pd\n"); goto err_out; } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_cleanup(pd); if (err) rxe_err_pd(pd, "cleanup failed, err = %d\n", err); return 0; } /* ah */ static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); struct rxe_create_ah_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { /* test if new user provider */ if (udata->outlen >= sizeof(*uresp)) uresp = udata->outbuf; ah->is_user = true; } else { ah->is_user = false; } err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (err) { rxe_dbg_dev(rxe, "unable to create ah\n"); goto err_out; } /* create index > 0 */ ah->ah_num = ah->elem.index; err = rxe_ah_chk_attr(ah, init_attr->ah_attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_cleanup; } if (uresp) { /* only if new user provider */ err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { err = -EFAULT; rxe_dbg_ah(ah, "unable to copy to user\n"); goto err_cleanup; } } else if (ah->is_user) { /* only if old user provider */ ah->ah_num = 0; } rxe_init_av(init_attr->ah_attr, &ah->av); rxe_finalize(ah); return 0; err_cleanup: cleanup_err = rxe_cleanup(ah); if (cleanup_err) rxe_err_ah(ah, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_ah_chk_attr(ah, attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_out; } rxe_init_av(attr, &ah->av); return 0; err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); memset(attr, 0, sizeof(*attr)); attr->type = ibah->type; rxe_av_to_attr(&ah->av, attr); return 0; } static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); if (err) rxe_err_ah(ah, "cleanup failed, err = %d\n", err); return 0; } /* srq */ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_pd *pd = to_rpd(ibsrq->pd); struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_create_srq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_err_dev(rxe, "malformed udata\n"); goto err_out; } uresp = udata->outbuf; } if (init->srq_type != IB_SRQT_BASIC) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "srq type = %d, not supported\n", init->srq_type); goto err_out; } err = rxe_srq_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "invalid init attributes\n"); goto err_out; } err = rxe_add_to_pool(&rxe->srq_pool, srq); if (err) { rxe_dbg_dev(rxe, "unable to create srq, err = %d\n", err); goto err_out; } rxe_get(pd); srq->pd = pd; err = rxe_srq_from_init(rxe, srq, init, udata, uresp); if (err) { rxe_dbg_srq(srq, "create srq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(srq); if (cleanup_err) rxe_err_srq(srq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_modify_srq_cmd cmd = {}; int err; if (udata) { if (udata->inlen < sizeof(cmd)) { err = -EINVAL; rxe_dbg_srq(srq, "malformed udata\n"); goto err_out; } err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); if (err) { err = -EFAULT; rxe_dbg_srq(srq, "unable to read udata\n"); goto err_out; } } err = rxe_srq_chk_attr(rxe, srq, attr, mask); if (err) { rxe_dbg_srq(srq, "bad init attributes\n"); goto err_out; } err = rxe_srq_from_attr(rxe, srq, attr, mask, &cmd, udata); if (err) { rxe_dbg_srq(srq, "bad attr\n"); goto err_out; } return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; if (srq->error) { err = -EINVAL; rxe_dbg_srq(srq, "srq in error state\n"); goto err_out; } attr->max_wr = srq->rq.queue->buf->index_mask; attr->max_sge = srq->rq.max_sge; attr->srq_limit = srq->limit; return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_srq *srq = to_rsrq(ibsrq); unsigned long flags; spin_lock_irqsave(&srq->rq.producer_lock, flags); while (wr) { err = post_one_recv(&srq->rq, wr); if (unlikely(err)) break; wr = wr->next; } spin_unlock_irqrestore(&srq->rq.producer_lock, flags); if (err) { *bad_wr = wr; rxe_err_srq(srq, "returned err = %d\n", err); } return err; } static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; err = rxe_cleanup(srq); if (err) rxe_err_srq(srq, "cleanup failed, err = %d\n", err); return 0; } /* qp */ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_pd *pd = to_rpd(ibqp->pd); struct rxe_qp *qp = to_rqp(ibqp); struct rxe_create_qp_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->inlen) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } qp->is_user = true; uresp = udata->outbuf; } else { qp->is_user = false; } if (init->create_flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported create_flags, err = %d\n", err); goto err_out; } err = rxe_qp_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "bad init attr, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->qp_pool, qp); if (err) { rxe_dbg_dev(rxe, "unable to create qp, err = %d\n", err); goto err_out; } err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata); if (err) { rxe_dbg_qp(qp, "create qp failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(qp); return 0; err_cleanup: cleanup_err = rxe_cleanup(qp); if (cleanup_err) rxe_err_qp(qp, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); int err; if (mask & ~IB_QP_ATTR_STANDARD_BITS) { err = -EOPNOTSUPP; rxe_dbg_qp(qp, "unsupported mask = 0x%x, err = %d\n", mask, err); goto err_out; } err = rxe_qp_chk_attr(rxe, qp, attr, mask); if (err) { rxe_dbg_qp(qp, "bad mask/attr, err = %d\n", err); goto err_out; } err = rxe_qp_from_attr(qp, attr, mask, udata); if (err) { rxe_dbg_qp(qp, "modify qp failed, err = %d\n", err); goto err_out; } if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, qp->ibqp.qp_num, qp->attr.dest_qp_num); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_qp_init_attr *init) { struct rxe_qp *qp = to_rqp(ibqp); rxe_qp_to_init(qp, init); rxe_qp_to_attr(qp, attr, mask); return 0; } static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct rxe_qp *qp = to_rqp(ibqp); int err; err = rxe_qp_chk_destroy(qp); if (err) { rxe_dbg_qp(qp, "unable to destroy qp, err = %d\n", err); goto err_out; } err = rxe_cleanup(qp); if (err) rxe_err_qp(qp, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } /* send wr */ /* sanity check incoming send work request */ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int *maskp, unsigned int *lengthp) { int num_sge = ibwr->num_sge; struct rxe_sq *sq = &qp->sq; unsigned int mask = 0; unsigned long length = 0; int err = -EINVAL; int i; do { mask = wr_opcode_mask(ibwr->opcode, qp); if (!mask) { rxe_err_qp(qp, "bad wr opcode for qp type\n"); break; } if (num_sge > sq->max_sge) { rxe_err_qp(qp, "num_sge > max_sge\n"); break; } length = 0; for (i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; if (length > RXE_PORT_MAX_MSG_SZ) { rxe_err_qp(qp, "message length too long\n"); break; } if (mask & WR_ATOMIC_MASK) { if (length != 8) { rxe_err_qp(qp, "atomic length != 8\n"); break; } if (atomic_wr(ibwr)->remote_addr & 0x7) { rxe_err_qp(qp, "misaligned atomic address\n"); break; } } if (ibwr->send_flags & IB_SEND_INLINE) { if (!(mask & WR_INLINE_MASK)) { rxe_err_qp(qp, "opcode doesn't support inline data\n"); break; } if (length > sq->max_inline) { rxe_err_qp(qp, "inline length too big\n"); break; } } err = 0; } while (0); *maskp = mask; *lengthp = (int)length; return err; } static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, const struct ib_send_wr *ibwr) { wr->wr_id = ibwr->wr_id; wr->opcode = ibwr->opcode; wr->send_flags = ibwr->send_flags; if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) { struct ib_ah *ibah = ud_wr(ibwr)->ah; wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; wr->wr.ud.ah_num = to_rah(ibah)->ah_num; if (qp_type(qp) == IB_QPT_GSI) wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND: break; default: rxe_err_qp(qp, "bad wr opcode %d for UD/GSI QP\n", wr->opcode); return -EINVAL; } } else { switch (wr->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; fallthrough; case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_RDMA_READ_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: wr->wr.atomic.remote_addr = atomic_wr(ibwr)->remote_addr; wr->wr.atomic.compare_add = atomic_wr(ibwr)->compare_add; wr->wr.atomic.swap = atomic_wr(ibwr)->swap; wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; break; case IB_WR_LOCAL_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_REG_MR: wr->wr.reg.mr = reg_wr(ibwr)->mr; wr->wr.reg.key = reg_wr(ibwr)->key; wr->wr.reg.access = reg_wr(ibwr)->access; break; case IB_WR_SEND: case IB_WR_BIND_MW: case IB_WR_FLUSH: case IB_WR_ATOMIC_WRITE: break; default: rxe_err_qp(qp, "unsupported wr opcode %d\n", wr->opcode); return -EINVAL; } } return 0; } static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, const struct ib_send_wr *ibwr) { struct ib_sge *sge = ibwr->sg_list; u8 *p = wqe->dma.inline_data; int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length, struct rxe_send_wqe *wqe) { int num_sge = ibwr->num_sge; int err; err = init_send_wr(qp, &wqe->wr, ibwr); if (err) return err; /* local operation */ if (unlikely(mask & WR_LOCAL_OP_MASK)) { wqe->mask = mask; wqe->state = wqe_state_posted; return 0; } if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) copy_inline_data_to_wqe(wqe, ibwr); else memcpy(wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0; wqe->mask = mask; wqe->dma.length = length; wqe->dma.resid = length; wqe->dma.num_sge = num_sge; wqe->dma.cur_sge = 0; wqe->dma.sge_offset = 0; wqe->state = wqe_state_posted; wqe->ssn = atomic_add_return(1, &qp->ssn); return 0; } static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr) { int err; struct rxe_sq *sq = &qp->sq; struct rxe_send_wqe *send_wqe; unsigned int mask; unsigned int length; int full; err = validate_send_wr(qp, ibwr, &mask, &length); if (err) return err; full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { rxe_err_qp(qp, "send queue full\n"); return -ENOMEM; } send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); err = init_send_wqe(qp, ibwr, mask, length, send_wqe); if (!err) queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); return err; } static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *ibwr, const struct ib_send_wr **bad_wr) { int err = 0; unsigned long flags; int good = 0; spin_lock_irqsave(&qp->sq.sq_lock, flags); while (ibwr) { err = post_one_send(qp, ibwr); if (err) { *bad_wr = ibwr; break; } else { good++; } ibwr = ibwr->next; } spin_unlock_irqrestore(&qp->sq.sq_lock, flags); /* kickoff processing of any posted wqes */ if (good) rxe_sched_task(&qp->send_task); return err; } static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { struct rxe_qp *qp = to_rqp(ibqp); int err; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_err_qp(qp, "qp not ready to send\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->is_user) { /* Utilize process context to do protocol processing */ rxe_sched_task(&qp->send_task); } else { err = rxe_post_send_kernel(qp, wr, bad_wr); if (err) return err; } return 0; } /* recv wr */ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) { int i; unsigned long length; struct rxe_recv_wqe *recv_wqe; int num_sge = ibwr->num_sge; int full; int err; full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { err = -ENOMEM; rxe_dbg("queue full\n"); goto err_out; } if (unlikely(num_sge > rq->max_sge)) { err = -EINVAL; rxe_dbg("bad num_sge > max_sge\n"); goto err_out; } length = 0; for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; if (length > RXE_PORT_MAX_MSG_SZ) { err = -EINVAL; rxe_dbg("message length too long\n"); goto err_out; } recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); recv_wqe->wr_id = ibwr->wr_id; recv_wqe->dma.length = length; recv_wqe->dma.resid = length; recv_wqe->dma.num_sge = num_sge; recv_wqe->dma.cur_sge = 0; recv_wqe->dma.sge_offset = 0; memcpy(recv_wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); return 0; err_out: rxe_dbg("returned err = %d\n", err); return err; } static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_qp *qp = to_rqp(ibqp); struct rxe_rq *rq = &qp->rq; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } /* see C10-97.2.1 */ if (unlikely((qp_state(qp) < IB_QPS_INIT))) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_dbg_qp(qp, "qp not ready to post recv\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(qp->srq)) { *bad_wr = wr; rxe_dbg_qp(qp, "qp has srq, use post_srq_recv instead\n"); return -EINVAL; } spin_lock_irqsave(&rq->producer_lock, flags); while (wr) { err = post_one_recv(rq, wr); if (unlikely(err)) { *bad_wr = wr; break; } wr = wr->next; } spin_unlock_irqrestore(&rq->producer_lock, flags); spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->recv_task); spin_unlock_irqrestore(&qp->state_lock, flags); return err; } /* cq */ static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs) { struct ib_udata *udata = &attrs->driver_udata; struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } uresp = udata->outbuf; } if (attr->flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "bad attr->flags, err = %d\n", err); goto err_out; } err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); if (err) { rxe_dbg_dev(rxe, "bad init attributes, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->cq_pool, cq); if (err) { rxe_dbg_dev(rxe, "unable to create cq, err = %d\n", err); goto err_out; } err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); if (err) { rxe_dbg_cq(cq, "create cq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(cq); if (cleanup_err) rxe_err_cq(cq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); struct rxe_resize_cq_resp __user *uresp = NULL; int err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_cq(cq, "malformed udata\n"); goto err_out; } uresp = udata->outbuf; } err = rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) { rxe_dbg_cq(cq, "bad attr, err = %d\n", err); goto err_out; } err = rxe_cq_resize_queue(cq, cqe, uresp, udata); if (err) { rxe_dbg_cq(cq, "resize cq failed, err = %d\n", err); goto err_out; } return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { int i; struct rxe_cq *cq = to_rcq(ibcq); struct rxe_cqe *cqe; unsigned long flags; spin_lock_irqsave(&cq->cq_lock, flags); for (i = 0; i < num_entries; i++) { cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); if (!cqe) break; /* queue empty */ memcpy(wc++, &cqe->ibwc, sizeof(*wc)); queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); } spin_unlock_irqrestore(&cq->cq_lock, flags); return i; } static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) { struct rxe_cq *cq = to_rcq(ibcq); int count; count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP); return (count > wc_cnt) ? wc_cnt : count; } static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct rxe_cq *cq = to_rcq(ibcq); int ret = 0; int empty; unsigned long irq_flags; spin_lock_irqsave(&cq->cq_lock, irq_flags); cq->notify |= flags & IB_CQ_SOLICITED_MASK; empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) ret = 1; spin_unlock_irqrestore(&cq->cq_lock, irq_flags); return ret; } static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); int err; /* See IBA C11-17: The CI shall return an error if this Verb is * invoked while a Work Queue is still associated with the CQ. */ if (atomic_read(&cq->num_wq)) { err = -EINVAL; rxe_dbg_cq(cq, "still in use\n"); goto err_out; } err = rxe_cleanup(cq); if (err) rxe_err_cq(cq, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } /* mr */ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_dev(rxe, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; rxe_mr_init_dma(access, mr); rxe_finalize(mr); return &mr->ibmr; err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int access, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_pd(pd, "access = %#x not supported (%#x)\n", access, RXE_ACCESS_SUPPORTED_MR); return ERR_PTR(-EOPNOTSUPP); } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_pd(pd, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_user(rxe, start, length, access, mr); if (err) { rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", cleanup_err); err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, u64 iova, int access, struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); struct rxe_pd *old_pd = to_rpd(ibmr->pd); struct rxe_pd *pd = to_rpd(ibpd); /* for now only support the two easy cases: * rereg_pd and rereg_access */ if (flags & ~RXE_MR_REREG_SUPPORTED) { rxe_err_mr(mr, "flags = %#x not supported\n", flags); return ERR_PTR(-EOPNOTSUPP); } if (flags & IB_MR_REREG_PD) { rxe_put(old_pd); rxe_get(pd); mr->ibmr.pd = ibpd; } if (flags & IB_MR_REREG_ACCESS) { if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_mr(mr, "access = %#x not supported\n", access); return ERR_PTR(-EOPNOTSUPP); } mr->access = access; } return NULL; } static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (mr_type != IB_MR_TYPE_MEM_REG) { err = -EINVAL; rxe_dbg_pd(pd, "mr type %d not supported, err = %d\n", mr_type, err); goto err_out; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_fast(max_num_sg, mr); if (err) { rxe_dbg_mr(mr, "alloc_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", err); err_free: kfree(mr); err_out: rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); int err, cleanup_err; /* See IBA 10.6.7.2.6 */ if (atomic_read(&mr->num_mw) > 0) { err = -EINVAL; rxe_dbg_mr(mr, "mr has mw's bound\n"); goto err_out; } cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", cleanup_err); kfree_rcu_mightsleep(mr); return 0; err_out: rxe_err_mr(mr, "returned err = %d\n", err); return err; } static ssize_t parent_show(struct device *device, struct device_attribute *attr, char *buf) { struct rxe_dev *rxe = rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); return sysfs_emit(buf, "%s\n", rxe_parent_name(rxe, 1)); } static DEVICE_ATTR_RO(parent); static struct attribute *rxe_dev_attributes[] = { &dev_attr_parent.attr, NULL }; static const struct attribute_group rxe_attr_group = { .attrs = rxe_dev_attributes, }; static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); struct net_device *ndev; ndev = rxe_ib_device_get_netdev(ib_dev); if (!ndev) return -ENODEV; rxe_set_port_state(rxe); dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev)); dev_put(ndev); return 0; } static const struct ib_device_ops rxe_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_RXE, .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, .alloc_hw_port_stats = rxe_ib_alloc_hw_port_stats, .alloc_mr = rxe_alloc_mr, .alloc_mw = rxe_alloc_mw, .alloc_pd = rxe_alloc_pd, .alloc_ucontext = rxe_alloc_ucontext, .attach_mcast = rxe_attach_mcast, .create_ah = rxe_create_ah, .create_cq = rxe_create_cq, .create_qp = rxe_create_qp, .create_srq = rxe_create_srq, .create_user_ah = rxe_create_ah, .dealloc_driver = rxe_dealloc, .dealloc_mw = rxe_dealloc_mw, .dealloc_pd = rxe_dealloc_pd, .dealloc_ucontext = rxe_dealloc_ucontext, .dereg_mr = rxe_dereg_mr, .destroy_ah = rxe_destroy_ah, .destroy_cq = rxe_destroy_cq, .destroy_qp = rxe_destroy_qp, .destroy_srq = rxe_destroy_srq, .detach_mcast = rxe_detach_mcast, .device_group = &rxe_attr_group, .enable_driver = rxe_enable_driver, .get_dma_mr = rxe_get_dma_mr, .get_hw_stats = rxe_ib_get_hw_stats, .get_link_layer = rxe_get_link_layer, .get_port_immutable = rxe_port_immutable, .map_mr_sg = rxe_map_mr_sg, .mmap = rxe_mmap, .modify_ah = rxe_modify_ah, .modify_device = rxe_modify_device, .modify_port = rxe_modify_port, .modify_qp = rxe_modify_qp, .modify_srq = rxe_modify_srq, .peek_cq = rxe_peek_cq, .poll_cq = rxe_poll_cq, .post_recv = rxe_post_recv, .post_send = rxe_post_send, .post_srq_recv = rxe_post_srq_recv, .query_ah = rxe_query_ah, .query_device = rxe_query_device, .query_pkey = rxe_query_pkey, .query_port = rxe_query_port, .query_qp = rxe_query_qp, .query_srq = rxe_query_srq, .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, .rereg_user_mr = rxe_rereg_user_mr, .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, rxe_qp, ibqp), INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name, struct net_device *ndev) { int err; struct ib_device *dev = &rxe->ib_dev; strscpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, ndev->dev_addr); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1); if (err) return err; err = rxe_icrc_init(rxe); if (err) return err; err = ib_register_device(dev, ibdev_name, NULL); if (err) rxe_dbg_dev(rxe, "failed with error %d\n", err); /* * Note that rxe may be invalid at this point if another thread * unregistered it. */ return err; }
150 150 150 150 150 5 146 9 9 71 7 69 53 6 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/ipv6.h> #include <linux/in6.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> #include <linux/rcupdate.h> #include <linux/sysctl.h> #include <net/ipv6_frag.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #endif #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, struct sk_buff *skb) { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); } #endif if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) return IP6_DEFRAG_CONNTRACK_IN + zone_id; else return IP6_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv6_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int err; #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Previously seen (loopback)? */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; if (skb->_nfct == IP_CT_UNTRACKED) return NF_ACCEPT; #endif err = nf_ct_frag6_gather(state->net, skb, nf_ct6_defrag_user(state->hook, skb)); /* queued */ if (err == -EINPROGRESS) return NF_STOLEN; return err == 0 ? NF_ACCEPT : NF_DROP; } static const struct nf_hook_ops ipv6_defrag_ops[] = { { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, }; static void __net_exit defrag6_net_exit(struct net *net) { if (net->nf.defrag_ipv6_users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); net->nf.defrag_ipv6_users = 0; } } static const struct nf_defrag_hook defrag_hook = { .owner = THIS_MODULE, .enable = nf_defrag_ipv6_enable, .disable = nf_defrag_ipv6_disable, }; static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; static int __init nf_defrag_init(void) { int ret = 0; ret = nf_ct_frag6_init(); if (ret < 0) { pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); return ret; } ret = register_pernet_subsys(&defrag6_net_ops); if (ret < 0) { pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook); return ret; cleanup_frag6: nf_ct_frag6_cleanup(); return ret; } static void __exit nf_defrag_fini(void) { rcu_assign_pointer(nf_defrag_v6_hook, NULL); unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } int nf_defrag_ipv6_enable(struct net *net) { int err = 0; mutex_lock(&defrag6_mutex); if (net->nf.defrag_ipv6_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } if (net->nf.defrag_ipv6_users) { net->nf.defrag_ipv6_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) net->nf.defrag_ipv6_users = 1; out_unlock: mutex_unlock(&defrag6_mutex); return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); void nf_defrag_ipv6_disable(struct net *net) { mutex_lock(&defrag6_mutex); if (net->nf.defrag_ipv6_users) { net->nf.defrag_ipv6_users--; if (net->nf.defrag_ipv6_users == 0) nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); } mutex_unlock(&defrag6_mutex); } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable); module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6 defragmentation support");
1 96 96 97 143 143 57 97 59 59 36 12 22 14 14 2 27 27 27 27 27 5 27 11 42 42 42 2 2 1 2 1 1 2 2 2 2 2 2 1 41 15 37 42 42 42 42 42 2 1 104 7 24 120 120 102 55 59 4 1 1 25 25 15 22 22 22 44 26 30 29 4 4 4 4 4 15 13 14 27 21 33 33 33 32 31 15 8 8 8 3 3 3 63 8 55 63 63 63 9 54 42 42 21 24 42 77 77 77 2 2 2 72 51 33 5 13 36 36 23 29 55 55 57 57 57 56 55 2 2 2 2 2 2 2 2 2 2 88 71 21 72 71 35 33 32 1 2 39 1 65 61 61 61 61 120 120 116 31 58 120 16 14 14 120 31 99 44 31 120 17 117 118 118 43 25 4 4 68 1 69 69 68 69 69 69 68 1 69 3 3 46 46 4 69 68 68 67 19 66 2 67 67 67 67 45 67 67 66 67 67 1 1 25 25 25 25 25 25 2 25 19 24 25 25 1 25 25 17 24 69 67 6 16 10 69 68 69 67 67 64 52 23 22 67 72 72 72 22 69 5 37 69 4 67 18 16 69 48 65 25 67 31 30 4 26 2 3 23 1 5 70 71 32 71 62 92 92 92 82 82 3 2 79 78 71 9 71 41 2 70 71 70 2 68 68 41 66 69 9 40 26 11 32 3 8 16 24 1 1 4 18 32 32 32 24 8 3 24 13 10 2 9 4 9 9 9 9 9 9 8 487 486 18 25 25 35 35 8 25 25 25 35 5 4 35 33 1 1 33 32 31 31 32 32 32 9 11 1 1 4 1 1 2 1 6 15 15 28 2 1 1 2 32 9 2 9 5 2 9 13 13 6 7 33 28 7 69 69 69 13 43 36 44 57 17 69 34 44 44 3 43 68 46 25 5 29 28 7 13 1 2 11 13 12 12 12 6 1 6 5 1 1 1 1 1 9 8 1 22 20 20 20 20 12 12 27 27 12 12 12 12 12 12 66 64 13 50 49 3 53 1 1 52 26 26 25 5 22 56 47 157 62 42 42 42 104 104 104 69 40 39 41 40 41 49 50 50 49 48 50 50 50 13 11 51 33 22 22 22 53 51 53 37 7 1 27 1 1 46 45 9 46 50 53 53 50 15 53 15 50 53 73 74 74 45 31 15 15 1 15 15 15 47 46 66 66 66 66 66 52 52 109 110 2 108 112 112 16 16 487 490 485 33 31 2 12 28 29 12 458 111 477 56 46 67 36 81 24 492 9 8 1 1 22 22 1 21 21 14 9 2 8 8 8 13 9 9 9 9 9 16 16 2 14 56 56 6 1 5 2 2 2 1 18 18 8 1 3 4 2 69 68 69 69 69 69 24 44 69 9 61 25 44 9 60 12 12 1 4 7 4 7 11 10 11 1 10 10 1 9 8 8 8 1 7 2 6 1 5 5 5 5 4 1 1 11 2 12 12 12 12 2 11 1 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/atomic.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/protocol.h> #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) #include <net/transp_v6.h> #endif #include <net/mptcp.h> #include <net/hotdata.h> #include <net/xfrm.h> #include <asm/ioctls.h> #include "protocol.h" #include "mib.h" #define CREATE_TRACE_POINTS #include <trace/events/mptcp.h> #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; struct ipv6_pinfo np; }; #endif enum { MPTCP_CMSG_TS = BIT(0), MPTCP_CMSG_INQ = BIT(1), }; static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp; static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device *mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) { return READ_ONCE(msk->wnd_end); } static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (sk->sk_prot == &tcpv6_prot) return &inet6_stream_ops; #endif WARN_ON_ONCE(sk->sk_prot != &tcp_prot); return &inet_stream_ops; } static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; struct socket *ssock; int err; err = mptcp_subflow_create_socket(sk, sk->sk_family, &ssock); if (err) return err; msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); subflow = mptcp_subflow_ctx(ssock->sk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssock->sk); subflow->request_mptcp = 1; subflow->subflow_id = msk->subflow_id++; /* This is the first subflow, always with id 0 */ WRITE_ONCE(subflow->local_id, 0); mptcp_sock_graft(msk->first, sk->sk_socket); iput(SOCK_INODE(ssock)); return 0; } /* If the MPC handshake is not started, returns the first subflow, * eventually allocating it. */ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; int ret; if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) return ERR_PTR(-EINVAL); if (!msk->first) { ret = __mptcp_socket_create(msk); if (ret) return ERR_PTR(ret); } return msk->first; } static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); __kfree_skb(skb); } static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) { WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, mptcp_sk(sk)->rmem_fwd_alloc + size); } static void mptcp_rmem_charge(struct sock *sk, int size) { mptcp_rmem_fwd_alloc_add(sk, -size); } static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { bool fragstolen; int delta; if (MPTCP_SKB_CB(from)->offset || ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; /* note the fwd memory can reach a negative value after accounting * for the delta, but the later skb free will restore a non * negative one */ atomic_add(delta, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, delta); kfree_skb_partial(from, fragstolen); return true; } static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, struct sk_buff *from) { if (MPTCP_SKB_CB(from)->map_seq != MPTCP_SKB_CB(to)->end_seq) return false; return mptcp_try_coalesce((struct sock *)msk, to, from); } static void __mptcp_rmem_reclaim(struct sock *sk, int amount) { amount >>= PAGE_SHIFT; mptcp_rmem_charge(sk, amount << PAGE_SHIFT); __sk_mem_reduce_allocated(sk, amount); } static void mptcp_rmem_uncharge(struct sock *sk, int size) { struct mptcp_sock *msk = mptcp_sk(sk); int reclaimable; mptcp_rmem_fwd_alloc_add(sk, size); reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); /* see sk_mem_uncharge() for the rationale behind the following schema */ if (unlikely(reclaimable >= PAGE_SIZE)) __mptcp_rmem_reclaim(sk, reclaimable); } static void mptcp_rfree(struct sk_buff *skb) { unsigned int len = skb->truesize; struct sock *sk = skb->sk; atomic_sub(len, &sk->sk_rmem_alloc); mptcp_rmem_uncharge(sk, len); } void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = mptcp_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, skb->truesize); } /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks */ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) { struct sock *sk = (struct sock *)msk; struct rb_node **p, *parent; u64 seq, end_seq, max_seq; struct sk_buff *skb1; seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); if (after64(end_seq, max_seq)) { /* out of window */ mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } p = &msk->out_of_order_queue.rb_node; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUE); if (RB_EMPTY_ROOT(&msk->out_of_order_queue)) { rb_link_node(&skb->rbnode, NULL, p); rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); msk->ooo_last_skb = skb; goto end; } /* with 2 subflows, adding at end of ooo queue is quite likely * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. */ if (mptcp_ooo_try_coalesce(msk, msk->ooo_last_skb, skb)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); return; } /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ if (!before64(seq, MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOQUEUETAIL); parent = &msk->ooo_last_skb->rbnode; p = &parent->rb_right; goto insert; } /* Find place to insert this segment. Handle overlaps on the way. */ parent = NULL; while (*p) { parent = *p; skb1 = rb_to_skb(parent); if (before64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { p = &parent->rb_left; continue; } if (before64(seq, MPTCP_SKB_CB(skb1)->end_seq)) { if (!after64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); return; } if (after64(seq, MPTCP_SKB_CB(skb1)->map_seq)) { /* partial overlap: * | skb | * | skb1 | * continue traversing */ } else { /* skb's seq == skb1's seq and skb covers skb1. * Replace skb1 with skb. */ rb_replace_node(&skb1->rbnode, &skb->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); goto merge_right; } } else if (mptcp_ooo_try_coalesce(msk, skb1, skb)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_OFOMERGE); return; } p = &parent->rb_right; } insert: /* Insert segment into RB tree. */ rb_link_node(&skb->rbnode, parent, p); rb_insert_color(&skb->rbnode, &msk->out_of_order_queue); merge_right: /* Remove other segments covered by skb. */ while ((skb1 = skb_rb_next(skb)) != NULL) { if (before64(end_seq, MPTCP_SKB_CB(skb1)->end_seq)) break; rb_erase(&skb1->rbnode, &msk->out_of_order_queue); mptcp_drop(sk, skb1); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); } /* If there is no skb after us, we are the last_skb ! */ if (!skb1) msk->ooo_last_skb = skb; end: skb_condense(skb); mptcp_set_owner_r(skb, sk); } static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) { struct mptcp_sock *msk = mptcp_sk(sk); int amt, amount; if (size <= msk->rmem_fwd_alloc) return true; size -= msk->rmem_fwd_alloc; amt = sk_mem_pages(size); amount = amt << PAGE_SHIFT; if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) return false; mptcp_rmem_fwd_alloc_add(sk, amount); return true; } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb, unsigned int offset, size_t copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; struct sk_buff *tail; bool has_rxtstamp; __skb_unlink(skb, &ssk->sk_receive_queue); skb_ext_reset(skb); skb_orphan(skb); /* try to fetch required memory from subflow */ if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; } has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq * value */ MPTCP_SKB_CB(skb)->map_seq = mptcp_subflow_get_mapped_dsn(subflow); MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; mptcp_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { mptcp_data_queue_ofo(msk, skb); return false; } /* old data, keep it simple and drop the whole pkt, sender * will retransmit as needed, if needed. */ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); drop: mptcp_drop(sk, skb); return false; } static void mptcp_stop_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_retransmit_timer); mptcp_sk(sk)->timer_ival = 0; } static void mptcp_close_wake_up(struct sock *sk) { if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); else sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); return ((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) && msk->write_seq == READ_ONCE(msk->snd_una); } static void mptcp_check_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* Look for an acknowledged DATA_FIN */ if (mptcp_pending_data_fin_ack(sk)) { WRITE_ONCE(msk->snd_data_fin_enable, 0); switch (sk->sk_state) { case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_FIN_WAIT2); break; case TCP_CLOSING: case TCP_LAST_ACK: mptcp_set_state(sk, TCP_CLOSE); break; } mptcp_close_wake_up(sk); } } /* can be called with no lock acquired */ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) { struct mptcp_sock *msk = mptcp_sk(sk); if (READ_ONCE(msk->rcv_data_fin) && ((1 << inet_sk_state_load(sk)) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) { if (seq) *seq = rcv_data_fin_seq; return true; } } return false; } static void mptcp_set_datafin_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 retransmits; retransmits = min_t(u32, icsk->icsk_retransmits, ilog2(TCP_RTO_MAX / TCP_RTO_MIN)); mptcp_sk(sk)->timer_ival = TCP_RTO_MIN << retransmits; } static void __mptcp_set_timeout(struct sock *sk, long tout) { mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) { const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? inet_csk(ssk)->icsk_timeout - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) { struct mptcp_subflow_context *subflow; long tout = 0; mptcp_for_each_subflow(mptcp_sk(sk), subflow) tout = max(tout, mptcp_timeout_from_subflow(subflow)); __mptcp_set_timeout(sk, tout); } static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } void __mptcp_subflow_send_ack(struct sock *ssk) { if (tcp_can_send_ack(ssk)) tcp_send_ack(ssk); } static void mptcp_subflow_send_ack(struct sock *ssk) { bool slow; slow = lock_sock_fast(ssk); __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; mptcp_for_each_subflow(msk, subflow) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) { const struct inet_connection_sock *icsk = inet_csk(ssk); u8 ack_pending = READ_ONCE(icsk->icsk_ack.pending); const struct tcp_sock *tp = tcp_sk(ssk); return (ack_pending & ICSK_ACK_SCHED) && ((READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->rcv_wup) > READ_ONCE(icsk->icsk_ack.rcv_mss)) || (rx_empty && ack_pending & (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; int space = __mptcp_space(sk); bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)) && copied; rx_empty = !__mptcp_rmem(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) mptcp_subflow_cleanup_rbuf(ssk, copied); } } static bool mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; bool ret = false; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. * msk->rcv_data_fin was set when parsing the incoming options * at the subflow level and the msk lock was not held, so this * is the first opportunity to act on the DATA_FIN and change * the msk state. * * If we are caught up to the sequence number of the incoming * DATA_FIN, send the DATA_ACK now and do state transition. If * not caught up, do nothing and let the recv code send DATA_ACK * when catching up. */ if (mptcp_pending_data_fin(sk, &rcv_data_fin_seq)) { WRITE_ONCE(msk->ack_seq, msk->ack_seq + 1); WRITE_ONCE(msk->rcv_data_fin, 0); WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ switch (sk->sk_state) { case TCP_ESTABLISHED: mptcp_set_state(sk, TCP_CLOSE_WAIT); break; case TCP_FIN_WAIT1: mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: mptcp_set_state(sk, TCP_CLOSE); break; default: /* Other states not expected */ WARN_ON_ONCE(1); break; } ret = true; if (!__mptcp_check_fallback(msk)) mptcp_send_ack(msk); mptcp_close_wake_up(sk); } return ret; } static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) { if (READ_ONCE(msk->allow_infinite_fallback)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONFALLBACK); mptcp_do_fallback(ssk); } else { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET); mptcp_subflow_reset(ssk); } } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; bool done = false; int sk_rbuf; sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) { WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); sk_rbuf = ssk_rbuf; } } pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); do { u32 map_remaining, offset; u32 seq = tp->copied_seq; struct sk_buff *skb; bool fin; /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), * a different CPU can have already processed the pending * data, stop here or we can enter an infinite loop */ if (!moved) done = true; break; } if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could * collapse them between the dummy map creation and the * current dequeue. Be sure to adjust the map size. */ map_remaining = skb->len; subflow->map_data_len = skb->len; } offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (fin) { done = true; seq++; } if (offset < skb->len) { size_t len = skb->len - offset; if (tp->urg_data) done = true; if (__mptcp_move_skb(msk, ssk, skb, offset, len)) moved += len; seq += len; if (unlikely(map_remaining < len)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } } else { if (unlikely(!fin)) { DEBUG_NET_WARN_ON_ONCE(1); mptcp_dss_corruption(msk, ssk); } sk_eat_skb(ssk, skb); done = true; } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { done = true; break; } } while (more_data_avail); if (moved > 0) msk->last_data_recv = tcp_jiffies32; *bytes += moved; return done; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb, *tail; bool moved = false; struct rb_node *p; u64 end_seq; p = rb_first(&msk->out_of_order_queue); pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue)); while (p) { skb = rb_to_skb(p); if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) break; p = rb_next(p); rb_erase(&skb->rbnode, &msk->out_of_order_queue); if (unlikely(!after64(MPTCP_SKB_CB(skb)->end_seq, msk->ack_seq))) { mptcp_drop(sk, skb); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); continue; } end_seq = MPTCP_SKB_CB(skb)->end_seq; tail = skb_peek_tail(&sk->sk_receive_queue); if (!tail || !mptcp_ooo_try_coalesce(msk, tail, skb)) { int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; /* skip overlapping data, if any */ pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n", MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq, delta); MPTCP_SKB_CB(skb)->offset += delta; MPTCP_SKB_CB(skb)->map_seq += delta; __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->bytes_received += end_seq - msk->ack_seq; WRITE_ONCE(msk->ack_seq, end_seq); moved = true; } return moved; } static bool __mptcp_subflow_error_report(struct sock *sk, struct sock *ssk) { int err = sock_error(ssk); int ssk_state; if (!err) return false; /* only propagate errors on fallen-back sockets or * on MPC connect */ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(mptcp_sk(sk))) return false; /* We need to propagate only transition to CLOSE state. * Orphaned socket will see such state change via * subflow_sched_work_if_closed() and that path will properly * destroy the msk as needed. */ ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) mptcp_set_state(sk, ssk_state); WRITE_ONCE(sk->sk_err, -err); /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); sk_error_report(sk); return true; } void __mptcp_error_report(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_for_each_subflow(msk, subflow) if (__mptcp_subflow_error_report(sk, mptcp_subflow_tcp_sock(subflow))) break; } /* In most cases we will be able to lock the mptcp socket. If its already * owned, we need to defer to the work queue to avoid ABBA deadlock. */ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; __mptcp_move_skbs_from_subflow(msk, ssk, &moved); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { if (!sock_owned_by_user(sk)) __mptcp_error_report(sk); else __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); } /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but * this is not a good place to change state. Let the workqueue * do it. */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); return moved > 0; } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(sk); int sk_rbuf, ssk_rbuf; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing * more data to the msk receive queue */ if (unlikely(subflow->disposable)) return; ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); sk_rbuf = READ_ONCE(sk->sk_rcvbuf); if (unlikely(ssk_rbuf > sk_rbuf)) sk_rbuf = ssk_rbuf; /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ if (__mptcp_rmem(sk) > sk_rbuf) return; /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); mptcp_data_unlock(sk); } static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk) { mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq); WRITE_ONCE(msk->allow_infinite_fallback, false); mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC); } static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; if (sk->sk_state != TCP_ESTABLISHED) return false; /* attach to msk socket only after we are sure we will deal with it * at close time */ if (sk->sk_socket && !ssk->sk_socket) mptcp_sock_graft(ssk, sk->sk_socket); mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_subflow_joined(msk, ssk); mptcp_stop_tout_timer(sk); __mptcp_propagate_sndbuf(sk, ssk); return true; } static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list) { struct mptcp_subflow_context *tmp, *subflow; struct mptcp_sock *msk = mptcp_sk(sk); list_for_each_entry_safe(subflow, tmp, join_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); list_move_tail(&subflow->node, &msk->conn_list); if (!__mptcp_finish_join(msk, ssk)) mptcp_subflow_reset(ssk); unlock_sock_fast(ssk, slow); } } static bool mptcp_rtx_timer_pending(struct sock *sk) { return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); } static void mptcp_reset_rtx_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; /* prevent rescheduling on close */ if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return; tout = mptcp_sk(sk)->timer_ival; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } bool mptcp_schedule_work(struct sock *sk) { if (inet_sk_state_load(sk) != TCP_CLOSE && schedule_work(&mptcp_sk(sk)->work)) { /* each subflow already holds a reference to the sk, and the * workqueue is invoked by a subflow, so sk can't go away here. */ sock_hold(sk); return true; } return false; } static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->data_avail)) return mptcp_subflow_tcp_sock(subflow); } return NULL; } static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) { if (!tcp_skb_can_collapse_to(skb)) return false; /* can collapse only if MPTCP level sequence is in order and this * mapping has not been xmitted yet */ return mpext && mpext->data_seq + mpext->data_len == write_seq && !mpext->frozen; } /* we can append data to the given data frag if: * - there is space available in the backing page_frag * - the data frag tail matches the current page_frag free offset * - the data frag end sequence number matches the current write seq */ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, const struct page_frag *pfrag, const struct mptcp_data_frag *df) { return df && pfrag->page == df->page && pfrag->size - pfrag->offset > 0 && pfrag->offset == (df->offset + df->data_len) && df->data_seq + df->data_len == msk->write_seq; } static void dfrag_uncharge(struct sock *sk, int len) { sk_mem_uncharge(sk, len); sk_wmem_queued_add(sk, -len); } static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) { int len = dfrag->data_len + dfrag->overhead; list_del(&dfrag->list); dfrag_uncharge(sk, len); put_page(dfrag->page); } /* called under both the msk socket lock and the data lock */ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; u64 snd_una; snd_una = msk->snd_una; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; if (unlikely(dfrag == msk->first_pending)) { /* in recovery mode can see ack after the current snd head */ if (WARN_ON_ONCE(!msk->recovery)) break; WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } dfrag_clear(sk, dfrag); } dfrag = mptcp_rtx_head(sk); if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; /* prevent wrap around in recovery mode */ if (unlikely(delta > dfrag->already_sent)) { if (WARN_ON_ONCE(!msk->recovery)) goto out; if (WARN_ON_ONCE(delta > dfrag->data_len)) goto out; dfrag->already_sent += delta - dfrag->already_sent; } dfrag->data_seq += delta; dfrag->offset += delta; dfrag->data_len -= delta; dfrag->already_sent -= delta; dfrag_uncharge(sk, delta); } /* all retransmitted data acked, recovery completed */ if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) msk->recovery = false; out: if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) { if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_rtx_timer(sk); } else { mptcp_reset_rtx_timer(sk); } if (mptcp_pending_data_fin_ack(sk)) mptcp_schedule_work(sk); } static void __mptcp_clean_una_wakeup(struct sock *sk) { lockdep_assert_held_once(&sk->sk_lock.slock); __mptcp_clean_una(sk); mptcp_write_space(sk); } static void mptcp_clean_una_wakeup(struct sock *sk) { mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); mptcp_data_unlock(sk); } static void mptcp_enter_memory_pressure(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool first = true; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (first) tcp_enter_memory_pressure(ssk); sk_stream_moderate_sndbuf(ssk); first = false; } __mptcp_sync_sndbuf(sk); } /* ensure we get enough memory for the frag hdr, beyond some minimal amount of * data */ static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) { if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), pfrag, sk->sk_allocation))) return true; mptcp_enter_memory_pressure(sk); return false; } static struct mptcp_data_frag * mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, int orig_offset) { int offset = ALIGN(orig_offset, sizeof(long)); struct mptcp_data_frag *dfrag; dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); dfrag->data_len = 0; dfrag->data_seq = msk->write_seq; dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); dfrag->offset = offset + sizeof(struct mptcp_data_frag); dfrag->already_sent = 0; dfrag->page = pfrag->page; return dfrag; } struct mptcp_sendmsg_info { int mss_now; int size_goal; u16 limit; u16 sent; unsigned int flags; bool data_lock_held; }; static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; mptcp_snd_wnd = window_end - data_seq; avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; } static bool __mptcp_add_ext(struct sk_buff *skb, gfp_t gfp) { struct skb_ext *mpext = __skb_ext_alloc(gfp); if (!mpext) return false; __skb_ext_set(skb, SKB_EXT_MPTCP, mpext); return true; } static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { mptcp_enter_memory_pressure(sk); } return NULL; } static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { tcp_skb_entail(ssk, skb); return skb; } tcp_skb_tsorted_anchor_cleanup(skb); kfree_skb(skb); return NULL; } static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; return __mptcp_alloc_tx_skb(sk, ssk, gfp); } /* note: this always recompute the csum on the whole skb, even * if we just appended a single frag. More status info needed */ static void mptcp_update_data_checksum(struct sk_buff *skb, int added) { struct mptcp_ext *mpext = mptcp_get_ext(skb); __wsum csum = ~csum_unfold(mpext->csum); int offset = skb->len - added; mpext->csum = csum_fold(csum_block_add(csum, skb_checksum(skb, offset, added, 0), offset)); } static void mptcp_update_infinite_map(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_ext *mpext) { if (!mpext) return; mpext->infinite_map = 1; mpext->data_len = 0; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); mptcp_subflow_ctx(ssk)->send_infinite_map = 0; pr_fallback(msk); mptcp_do_fallback(ssk); } #define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1)) static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_data_frag *dfrag, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; bool can_coalesce = false; bool reuse_skb = true; struct sk_buff *skb; size_t copy; int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); if (WARN_ON_ONCE(info->sent > info->limit || info->limit > dfrag->data_len)) return 0; if (unlikely(!__tcp_can_send(ssk))) return -EAGAIN; /* compute send limit */ if (unlikely(ssk->sk_gso_max_size > MPTCP_MAX_GSO_SIZE)) ssk->sk_gso_max_size = MPTCP_MAX_GSO_SIZE; info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); copy = info->size_goal; skb = tcp_write_queue_tail(ssk); if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later * queue management operation, to avoid breaking the ext <-> * SSN association set here */ mpext = mptcp_get_ext(skb); if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } i = skb_shinfo(skb)->nr_frags; can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tcp_sk(ssk), skb); goto alloc_skb; } copy -= skb->len; } else { alloc_skb: skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); if (!skb) return -ENOMEM; i = skb_shinfo(skb)->nr_frags; reuse_skb = false; mpext = mptcp_get_ext(skb); } /* Zero window and all data acked? Probe. */ copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { tcp_remove_empty_skb(ssk); return 0; } zero_window_probe = true; data_seq = snd_una - 1; copy = 1; } copy = min_t(size_t, copy, info->limit - info->sent); if (!sk_wmem_schedule(ssk, copy)) { tcp_remove_empty_skb(ssk); return -ENOMEM; } if (can_coalesce) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(dfrag->page); skb_fill_page_desc(skb, i, dfrag->page, offset, copy); } skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk_wmem_queued_add(ssk, copy); sk_mem_charge(ssk, copy); WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); /* on skb reuse we just need to update the DSS len */ if (reuse_skb) { TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += copy; goto out; } memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n", mpext->data_seq, mpext->subflow_seq, mpext->data_len, mpext->dsn64); if (zero_window_probe) { mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) mptcp_update_data_checksum(skb, copy); if (mptcp_subflow_ctx(ssk)->send_infinite_map) mptcp_update_infinite_map(msk, ssk, mpext); trace_mptcp_sendmsg_frag(mpext); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ sizeof(struct tcphdr) - \ MAX_TCP_OPTION_SPACE - \ sizeof(struct ipv6hdr) - \ sizeof(struct frag_hdr)) struct subflow_send_info { struct sock *ssk; u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) { if (!subflow->stale) return; subflow->stale = 0; MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); } bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) { if (unlikely(subflow->stale)) { u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); if (subflow->stale_rcv_tstamp == rcv_tstamp) return false; mptcp_subflow_set_active(subflow); } return __mptcp_subflow_active(subflow); } #define SSK_MODE_ACTIVE 0 #define SSK_MODE_BACKUP 1 #define SSK_MODE_MAX 2 /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; u64 linger_time; long tout = 0; /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; send_info[i].linger_time = -1; } mptcp_for_each_subflow(msk, subflow) { bool backup = subflow->backup || subflow->request_bkup; trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !backup; pace = subflow->avg_pacing_rate; if (unlikely(!pace)) { /* init pacing rate from socket */ subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); pace = subflow->avg_pacing_rate; if (!pace) continue; } linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); if (linger_time < send_info[backup].linger_time) { send_info[backup].ssk = ssk; send_info[backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; /* According to the blest algorithm, to avoid HoL blocking for the * faster flow, we need to: * - estimate the faster flow linger time * - use the above to estimate the amount of byte transferred * by the faster flow * - check that the amount of queued data is greter than the above, * otherwise do not use the picked, slower, subflow * We select the subflow with the shorter estimated time to flush * the queued mem, which basically ensure the above. We just need * to check that subflow has a non empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; if (!ssk || !sk_stream_memory_free(ssk)) return NULL; burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); if (!burst) return ssk; subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); msk->snd_burst = burst; return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) { tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } static void mptcp_update_post_push(struct mptcp_sock *msk, struct mptcp_data_frag *dfrag, u32 sent) { u64 snd_nxt_new = dfrag->data_seq; dfrag->already_sent += sent; msk->snd_burst -= sent; snd_nxt_new += dfrag->already_sent; /* snd_nxt_new can be smaller than snd_nxt in case mptcp * is recovering after a failover. In that event, this re-sends * old segments. * * Thus compute snd_nxt_new candidate based on * the dfrag->data_seq that was sent and the data * that has been handed to the subflow for transmission * and skip update in case it was old dfrag. */ if (likely(after64(snd_nxt_new, msk->snd_nxt))) { msk->bytes_sent += snd_nxt_new - msk->snd_nxt; WRITE_ONCE(msk->snd_nxt, snd_nxt_new); } } void mptcp_check_and_set_pending(struct sock *sk) { if (mptcp_send_head(sk)) { mptcp_data_lock(sk); mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING); mptcp_data_unlock(sk); } } static int __subflow_push_pending(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dfrag; int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { info->sent = dfrag->already_sent; info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { err = copied ? : ret; goto out; } info->sent += ret; copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { err = copied; goto out; } mptcp_set_timeout(sk); } err = copied; out: if (err > 0) msk->last_data_sent = tcp_jiffies32; return err; } void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .flags = flags, }; bool do_check_data_fin = false; int push_count = 1; while (mptcp_send_head(sk) && (push_count > 0)) { struct mptcp_subflow_context *subflow; int ret = 0; if (mptcp_sched_get_send(msk)) break; push_count = 0; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); prev_ssk = ssk; ssk = mptcp_subflow_tcp_sock(subflow); if (ssk != prev_ssk) { /* First check. If the ssk has changed since * the last round, release prev_ssk */ if (prev_ssk) mptcp_push_release(prev_ssk, &info); /* Need to lock the new subflow only if different * from the previous one, otherwise we are still * helding the relevant lock */ lock_sock(ssk); } push_count++; ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) { if (ret != -EAGAIN || (1 << ssk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) push_count--; continue; } do_check_data_fin = true; } } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); /* ensure the rtx timer is running */ if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (do_check_data_fin) mptcp_check_send_data_fin(sk); } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { .data_lock_held = true, }; bool keep_pushing = true; struct sock *xmit_ssk; int copied = 0; info.flags = 0; while (mptcp_send_head(sk) && keep_pushing) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); int ret = 0; /* check for a different subflow usage only after * spooling the first chunk of data */ if (first) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); first = false; if (ret <= 0) break; copied += ret; continue; } if (mptcp_sched_get_send(msk)) goto out; if (READ_ONCE(subflow->scheduled)) { mptcp_subflow_set_scheduled(subflow, false); ret = __subflow_push_pending(sk, ssk, &info); if (ret <= 0) keep_pushing = false; copied += ret; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { xmit_ssk = mptcp_subflow_tcp_sock(subflow); if (xmit_ssk != ssk) { mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SEND); keep_pushing = false; } } } } out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock() */ if (copied) { tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); if (msk->snd_data_fin_enable && msk->snd_nxt + 1 == msk->write_seq) mptcp_schedule_work(sk); } } static int mptcp_disconnect(struct sock *sk, int flags); static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, size_t len, int *copied_syn) { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; int ret; /* on flags based fastopen the mptcp is supposed to create the * first subflow right now. Otherwise we are in the defer_connect * path, and the first subflow must be already present. * Since the defer_connect flag is cleared after the first succsful * fastopen attempt, no need to check for additional subflow status. */ if (msg->msg_flags & MSG_FASTOPEN) { ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); } if (!msk->first) return -EINVAL; ssk = msk->first; lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->fastopening = 1; ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); msk->fastopening = 0; msg->msg_flags = saved_flags; release_sock(ssk); /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, msg->msg_namelen, msg->msg_flags, 1); /* Keep the same behaviour of plain TCP: zero the copied bytes in * case of any error, except timeout or signal */ if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) *copied_syn = 0; } else if (ret && ret != -EINPROGRESS) { /* The disconnect() op called by tcp_sendmsg_fastopen()/ * __inet_stream_connect() can fail, due to looking check, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ if (!mptcp_disconnect(sk, 0)) { sk->sk_disconnects++; sk->sk_socket->state = SS_UNCONNECTED; } } inet_clear_bit(DEFER_CONNECT, sk); return ret; } static int do_copy_data_nocache(struct sock *sk, int copy, struct iov_iter *from, char *to) { if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) { return -EFAULT; } return 0; } /* open-code sk_stream_memory_free() plus sent limit computation to * avoid indirect calls in fast-path. * Called under the msk socket lock, so we can avoid a bunch of ONCE * annotations. */ static u32 mptcp_send_limit(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); u32 limit, not_sent; if (sk->sk_wmem_queued >= READ_ONCE(sk->sk_sndbuf)) return 0; limit = mptcp_notsent_lowat(sk); if (limit == UINT_MAX) return UINT_MAX; not_sent = msk->write_seq - msk->snd_nxt; if (not_sent >= limit) return 0; return limit - not_sent; } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; size_t copied = 0; int ret = 0; long timeo; /* silently ignore everything else */ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; lock_sock(sk); if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); copied += copied_syn; if (ret == -EINPROGRESS && copied_syn > 0) goto out; else if (ret) goto do_error; } timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) goto do_error; } ret = -EPIPE; if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) goto do_error; pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { int total_ts, frag_truesize = 0; struct mptcp_data_frag *dfrag; bool dfrag_collapsed; size_t psize, offset; u32 copy_limit; /* ensure fitting the notsent_lowat() constraint */ copy_limit = mptcp_send_limit(sk); if (!copy_limit) goto wait_for_memory; /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ dfrag = mptcp_pending_tail(sk); dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); if (!dfrag_collapsed) { if (!mptcp_page_frag_refill(sk, pfrag)) goto wait_for_memory; dfrag = mptcp_carve_data_frag(msk, pfrag, pfrag->offset); frag_truesize = dfrag->overhead; } /* we do not bound vs wspace, to allow a single packet. * memory accounting will prevent execessive memory usage * anyway */ offset = dfrag->offset + dfrag->data_len; psize = pfrag->size - offset; psize = min_t(size_t, psize, msg_data_left(msg)); psize = min_t(size_t, psize, copy_limit); total_ts = psize + frag_truesize; if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; ret = do_copy_data_nocache(sk, psize, &msg->msg_iter, page_address(dfrag->page) + offset); if (ret) goto do_error; /* data successfully copied into the write queue */ sk_forward_alloc_add(sk, -total_ts); copied += psize; dfrag->data_len += psize; frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk */ sk_wmem_queued_add(sk, frag_truesize); if (!dfrag_collapsed) { get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); continue; wait_for_memory: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto do_error; } if (copied) __mptcp_push_pending(sk, msg->msg_flags); out: release_sock(sk); return copied; do_error: if (copied) goto out; copied = sk_stream_error(sk, msg->msg_flags, ret); goto out; } static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct sk_buff *skb, *tmp; int copied = 0; skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); int err; if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { if (!copied) return err; break; } } if (MPTCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= MPTCP_CMSG_TS; } copied += count; if (count < data_len) { if (!(flags & MSG_PEEK)) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; msk->bytes_consumed += count; } break; } if (!(flags & MSG_PEEK)) { /* we will bulk release the skb memory later */ skb->destructor = NULL; WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); __skb_unlink(skb, &msk->receive_queue); __kfree_skb(skb); msk->bytes_consumed += count; } if (copied >= len) break; } mptcp_rcv_space_adjust(msk, copied); return copied; } /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. * * Only difference: Use highest rtt estimate of the subflows in use. */ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; msk_owned_by_me(msk); if (copied <= 0) return; if (!msk->rcvspace_init) mptcp_rcv_space_init(msk, msk->first); msk->rcvq_space.copied += copied; mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); rtt_us = msk->rcvq_space.rtt_us; if (rtt_us && time < (rtt_us >> 3)) return; rtt_us = 0; mptcp_for_each_subflow(msk, subflow) { const struct tcp_sock *tp; u64 sf_rtt_us; u32 sf_advmss; tp = tcp_sk(mptcp_subflow_tcp_sock(subflow)); sf_rtt_us = READ_ONCE(tp->rcv_rtt_est.rtt_us); sf_advmss = READ_ONCE(tp->advmss); rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { u64 rcvwin, grow; int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; window_clamp = mptcp_win_from_space(sk, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to * the mptcp rx queue fast enough (announced rcv_win can * exceed ssk->sk_rcvbuf). */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk; bool slow; ssk = mptcp_subflow_tcp_sock(subflow); slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); if (tcp_can_send_ack(ssk)) tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } } } msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; } static void __mptcp_update_rmem(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->rmem_released) return; atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); mptcp_rmem_uncharge(sk, msk->rmem_released); WRITE_ONCE(msk->rmem_released, 0); } static void __mptcp_splice_receive_queue(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); } static bool __mptcp_move_skbs(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int moved = 0; bool ret, done; do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; /* we can have data pending in the subflows only if the msk * receive buffer was full at subflow_data_ready() time, * that is an unlikely slow path. */ if (likely(!ssk)) break; slowpath = lock_sock_fast(ssk); mptcp_data_lock(sk); __mptcp_update_rmem(sk); done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); mptcp_data_unlock(sk); if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); } while (!done); /* acquire the data lock only if some input data is pending */ ret = moved > 0; if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || !skb_queue_empty_lockless(&sk->sk_receive_queue)) { mptcp_data_lock(sk); __mptcp_update_rmem(sk); ret |= __mptcp_ofo_queue(msk); __mptcp_splice_receive_queue(sk); mptcp_data_unlock(sk); } if (ret) mptcp_check_data_fin((struct sock *)msk); return !skb_queue_empty(&msk->receive_queue); } static unsigned int mptcp_inq_hint(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; skb = skb_peek(&msk->receive_queue); if (skb) { u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; if (hint_val >= INT_MAX) return INT_MAX; return (unsigned int)hint_val; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 1; return 0; } static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct mptcp_sock *msk = mptcp_sk(sk); struct scm_timestamping_internal tss; int copied = 0, cmsg_flags = 0; int target; long timeo; /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); if (unlikely(msk->recvmsg_inq)) cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; goto out_err; } copied += bytes_read; if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; /* only the MPTCP socket status is relevant here. The exit * conditions mirror closely tcp_recvmsg() */ if (copied >= target) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || signal_pending(current)) break; } else { if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) { /* race breaker: the shutdown could be after the * previous receive queue check */ if (__mptcp_move_skbs(msk)) continue; break; } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } pr_debug("block timeout %ld\n", timeo); mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; goto out_err; } } mptcp_cleanup_rbuf(msk, copied); out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (cmsg_flags & MPTCP_CMSG_INQ) { unsigned int inq = mptcp_inq_hint(sk); put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); } } pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", msk, skb_queue_empty_lockless(&sk->sk_receive_queue), skb_queue_empty(&msk->receive_queue), copied); release_sock(sk); return copied; } static void mptcp_retransmit_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { /* we need a process context to retransmit */ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ __set_bit(MPTCP_RETRANSMIT, &msk->cb_flags); } bh_unlock_sock(sk); sock_put(sk); } static void mptcp_tout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); mptcp_schedule_work(sk); sock_put(sk); } /* Find an idle subflow. Return NULL if there is unacked data at tcp * level. * * A backup subflow is returned only if that is the only kind available. */ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!__mptcp_subflow_active(subflow)) continue; /* still data outstanding at TCP level? skip this */ if (!tcp_rtx_and_write_queues_empty(ssk)) { mptcp_pm_subflow_chk_stale(msk, ssk); min_stale_count = min_t(int, min_stale_count, subflow->stale_count); continue; } if (subflow->backup || subflow->request_bkup) { if (!backup) backup = ssk; continue; } if (!pick) pick = ssk; } if (pick) return pick; /* use backup only if there are no progresses anywhere */ return min_stale_count > 1 ? backup : NULL; } bool __mptcp_retransmit_pending_data(struct sock *sk) { struct mptcp_data_frag *cur, *rtx_head; struct mptcp_sock *msk = mptcp_sk(sk); if (__mptcp_check_fallback(msk)) return false; /* the closing socket has some data untransmitted and/or unacked: * some data in the mptcp rtx queue has not really xmitted yet. * keep it simple and re-inject the whole mptcp level rtx queue */ mptcp_data_lock(sk); __mptcp_clean_una_wakeup(sk); rtx_head = mptcp_rtx_head(sk); if (!rtx_head) { mptcp_data_unlock(sk); return false; } msk->recovery_snd_nxt = msk->snd_nxt; msk->recovery = true; mptcp_data_unlock(sk); msk->first_pending = rtx_head; msk->snd_burst = 0; /* be sure to clear the "sent status" on all re-injected fragments */ list_for_each_entry(cur, &msk->rtx_queue, list) { if (!cur->already_sent) break; cur->already_sent = 0; } return true; } /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) #define MPTCP_CF_FASTCLOSE BIT(2) /* be sure to send a reset only if the caller asked for it, also * clean completely the subflow status when the subflow reaches * TCP_CLOSE state */ static void __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || (flags & MPTCP_CF_FASTCLOSE)) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ WARN_ON_ONCE(tcp_disconnect(ssk, 0)); mptcp_subflow_ctx_reset(subflow); } else { tcp_shutdown(ssk, SEND_SHUTDOWN); } } /* subflow sockets can be either outgoing (connect) or incoming * (accept). * * Outgoing subflows use in-kernel sockets. * Incoming subflows do not have their own 'struct socket' allocated, * so we need to use tcp_close() after detaching them from the mptcp * parent socket. */ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false; /* If the first subflow moved to a close state before accept, e.g. due * to an incoming reset or listener shutdown, the subflow socket is * already deleted by inet_child_forget() and the mptcp socket can't * survive too. */ if (msk->in_accept_queue && msk->first == ssk && (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) { /* ensure later check in mptcp_worker() will dispose the msk */ sock_set_flag(sk, SOCK_DEAD); mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); goto out_release; } dispose_it = msk->free_first || ssk != msk->first; if (dispose_it) list_del(&subflow->node); lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { /* be sure to force the tcp_close path * to generate the egress reset */ ssk->sk_lingertime = 0; sock_set_flag(ssk, SOCK_LINGER); subflow->send_fastclose = 1; } need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { __mptcp_subflow_disconnect(ssk, subflow, flags); release_sock(ssk); goto out; } subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops * the ssk has been already destroyed, we just need to release the * reference owned by msk; */ if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } out_release: __mptcp_subflow_error_report(sk, ssk); release_sock(ssk); sock_put(ssk); if (ssk == msk->first) WRITE_ONCE(msk->first, NULL); out: __mptcp_sync_sndbuf(sk); if (need_push) __mptcp_push_pending(sk, 0); /* Catch every 'all subflows closed' scenario, including peers silently * closing them, e.g. due to timeout. * For established sockets, allow an additional timeout before closing, * as the protocol can still create more subflows. */ if (list_is_singular(&msk->conn_list) && msk->first && inet_sk_state_load(msk->first) == TCP_CLOSE) { if (sk->sk_state != TCP_ESTABLISHED || msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) { mptcp_set_state(sk, TCP_CLOSE); mptcp_close_wake_up(sk); } else { mptcp_start_tout_timer(sk); } } } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { /* The first subflow can already be closed and still in the list */ if (subflow->close_event_done) return; subflow->close_event_done = true; if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); /* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow */ mptcp_pm_subflow_check_next(mptcp_sk(sk), subflow); __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_PUSH); } static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) { return 0; } static void __mptcp_close_subflow(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); might_sleep(); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int ssk_state = inet_sk_state_load(ssk); if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || inet_sk_state_load(sk) != TCP_ESTABLISHED)) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ if (!skb_queue_empty_lockless(&ssk->sk_receive_queue)) continue; mptcp_close_ssk(sk, ssk, subflow); } } static bool mptcp_close_tout_expired(const struct sock *sk) { if (!inet_csk(sk)->icsk_mtup.probe_timestamp || sk->sk_state == TCP_CLOSE) return false; return time_after32(tcp_jiffies32, inet_csk(sk)->icsk_mtup.probe_timestamp + mptcp_close_timeout(sk)); } static void mptcp_check_fastclose(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; if (likely(!READ_ONCE(msk->rcv_fastclose))) return; mptcp_token_destroy(msk); mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; slow = lock_sock_fast(tcp_sk); if (tcp_sk->sk_state != TCP_CLOSE) { mptcp_send_active_reset_reason(tcp_sk); tcp_set_state(tcp_sk, TCP_CLOSE); } unlock_sock_fast(tcp_sk, slow); } /* Mirror the tcp_reset() error propagation */ switch (sk->sk_state) { case TCP_SYN_SENT: WRITE_ONCE(sk->sk_err, ECONNREFUSED); break; case TCP_CLOSE_WAIT: WRITE_ONCE(sk->sk_err, EPIPE); break; case TCP_CLOSE: return; default: WRITE_ONCE(sk->sk_err, ECONNRESET); } mptcp_set_state(sk, TCP_CLOSE); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); /* the calling mptcp_worker will properly destroy the socket */ if (sock_flag(sk, SOCK_DEAD)) return; sk->sk_state_change(sk); sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; struct sock *ssk; int ret, err; u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_retransmits++; mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); goto reset_timer; } if (!mptcp_send_head(sk)) return; goto reset_timer; } if (err) goto reset_timer; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) { u16 copied = 0; mptcp_subflow_set_scheduled(subflow, false); ssk = mptcp_subflow_tcp_sock(subflow); lock_sock(ssk); /* limit retransmission to the bytes already sent on some subflows */ info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; while (info.sent < info.limit) { ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); copied += ret; info.sent += ret; } if (copied) { len = max(copied, len); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); WRITE_ONCE(msk->allow_infinite_fallback, false); } release_sock(ssk); } } msk->bytes_retrans += len; dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } /* schedule the timeout timer for the relevant event: either close timeout * or mp_fail timeout. The close timeout takes precedence on the mp_fail one */ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) { struct sock *sk = (struct sock *)msk; unsigned long timeout, close_timeout; if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp) return; close_timeout = (unsigned long)inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + mptcp_close_timeout(sk); /* the close timeout takes precedence on the fail one, and here at least one of * them is active */ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; sk_reset_timer(sk, &sk->sk_timer, timeout); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { struct sock *ssk = msk->first; bool slow; if (!ssk) return; pr_debug("MP_FAIL doesn't respond, reset the subflow\n"); slow = lock_sock_fast(ssk); mptcp_subflow_reset(ssk); WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); unlock_sock_fast(ssk, slow); } static void mptcp_do_fastclose(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, MPTCP_CF_FASTCLOSE); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = (struct sock *)msk; unsigned long fail_tout; int state; lock_sock(sk); state = sk->sk_state; if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_fastclose(msk); mptcp_pm_nl_work(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); mptcp_check_data_fin(sk); if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { mptcp_do_fastclose(sk); mptcp_close_wake_up(sk); } if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); goto unlock; } if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; if (fail_tout && time_after(jiffies, fail_tout)) mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); sock_put(sk); } static void __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; WRITE_ONCE(msk->rmem_fwd_alloc, 0); WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); WRITE_ONCE(msk->allow_infinite_fallback, true); msk->recovery = false; msk->subflow_id = 1; msk->last_data_sent = tcp_jiffies32; msk->last_data_recv = tcp_jiffies32; msk->last_ack_recv = tcp_jiffies32; mptcp_pm_data_init(msk); /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_assign_congestion_control(sk); strscpy(mptcp_sk(sk)->ca_name, icsk->icsk_ca_ops->name, sizeof(mptcp_sk(sk)->ca_name)); /* no need to keep a reference to the ops, the name will suffice */ tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = NULL; } static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); int ret; __mptcp_init_sock(sk); if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; rcu_read_lock(); ret = mptcp_init_sched(mptcp_sk(sk), mptcp_sched_find(mptcp_get_scheduler(net))); rcu_read_unlock(); if (ret) return ret; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); return 0; } static void __mptcp_clear_xmit(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; WRITE_ONCE(msk->first_pending, NULL); list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); if (cancel_work_sync(&msk->work)) __sock_put(sk); } void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) { lock_sock(ssk); switch (ssk->sk_state) { case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK)); break; default: if (__mptcp_check_fallback(mptcp_sk(sk))) { pr_debug("Fallback\n"); ssk->sk_shutdown |= how; tcp_shutdown(ssk, how); /* simulate the data_fin ack reception to let the state * machine move forward */ WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt); mptcp_schedule_work(sk); } else { pr_debug("Sending DATA_FIN on subflow %p\n", ssk); tcp_send_ack(ssk); if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); } break; } release_sock(ssk); } void mptcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); break; case TCP_CLOSE_WAIT: /* Unlike TCP, MPTCP sk would not have the TCP_SYN_RECV state: * MPTCP "accepted" sockets will be created later on. So no * transition from TCP_SYN_RECV to TCP_CLOSE_WAIT. */ break; default: if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) MPTCP_DEC_STATS(sock_net(sk), MPTCP_MIB_CURRESTAB); } inet_sk_state_store(sk, state); } static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, /* should not happen ! */ [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int mptcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; mptcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } static void mptcp_check_send_data_fin(struct sock *sk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n", msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), msk->snd_nxt, msk->write_seq); /* we still need to enqueue subflows or not really shutting down, * skip this */ if (!msk->snd_data_fin_enable || msk->snd_nxt + 1 != msk->write_seq || mptcp_send_head(sk)) return; WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); mptcp_subflow_shutdown(sk, tcp_sk, SEND_SHUTDOWN); } } static void __mptcp_wr_shutdown(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n", msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, !!mptcp_send_head(sk)); /* will be ignored by fallback sockets */ WRITE_ONCE(msk->write_seq, msk->write_seq + 1); WRITE_ONCE(msk->snd_data_fin_enable, 1); mptcp_check_send_data_fin(sk); } static void __mptcp_destroy_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p\n", msk); might_sleep(); mptcp_stop_rtx_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; mptcp_release_sched(msk); sk->sk_prot->destroy(sk); WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); sock_put(sk); } void __mptcp_unaccepted_force_close(struct sock *sk) { sock_set_flag(sk, SOCK_DEAD); mptcp_do_fastclose(sk); __mptcp_destroy_sock(sk); } static __poll_t mptcp_check_readable(struct sock *sk) { return mptcp_epollin_ready(sk) ? EPOLLIN | EPOLLRDNORM : 0; } static void mptcp_check_listen_stop(struct sock *sk) { struct sock *ssk; if (inet_sk_state_load(sk) != TCP_LISTEN) return; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); ssk = mptcp_sk(sk)->first; if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN)) return; lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); tcp_set_state(ssk, TCP_CLOSE); mptcp_subflow_queue_clean(sk, ssk); inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); release_sock(ssk); } bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; int subflows_alive = 0; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); goto cleanup; } if (mptcp_data_avail(msk) || timeout < 0) { /* If the msk has read data, or the caller explicitly ask it, * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose */ mptcp_do_fastclose(sk); timeout = 0; } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); } sk_stream_wait_close(sk, timeout); cleanup: /* orphan all the subflows */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); subflows_alive += ssk->sk_state != TCP_CLOSE; /* since the close timeout takes precedence on the fail one, * cancel the latter */ if (ssk == msk->first) subflow->fail_tout = 0; /* detach from the parent socket, but allow data_ready to * push incoming data into the mptcp stack, to properly ack it */ ssk->sk_socket = NULL; ssk->sk_wq = NULL; unlock_sock_fast(ssk, slow); } sock_orphan(sk); /* all the subflows are closed, only timeout can change the msk * state, let's not keep resources busy for no reasons */ if (subflows_alive == 0) mptcp_set_state(sk, TCP_CLOSE); sock_hold(sk); pr_debug("msk=%p state=%d\n", sk, sk->sk_state); mptcp_pm_connection_closed(msk); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { mptcp_start_tout_timer(sk); } return do_cancel_work; } static void mptcp_close(struct sock *sk, long timeout) { bool do_cancel_work; lock_sock(sk); do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); sock_put(sk); } static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); struct ipv6_pinfo *msk6 = inet6_sk(msk); msk->sk_v6_daddr = ssk->sk_v6_daddr; msk->sk_v6_rcv_saddr = ssk->sk_v6_rcv_saddr; if (msk6 && ssk6) { msk6->saddr = ssk6->saddr; msk6->flow_label = ssk6->flow_label; } #endif inet_sk(msk)->inet_num = inet_sk(ssk)->inet_num; inet_sk(msk)->inet_dport = inet_sk(ssk)->inet_dport; inet_sk(msk)->inet_sport = inet_sk(ssk)->inet_sport; inet_sk(msk)->inet_daddr = inet_sk(ssk)->inet_daddr; inet_sk(msk)->inet_saddr = inet_sk(ssk)->inet_saddr; inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); /* We are on the fastopen error path. We can't call straight into the * subflows cleanup code due to lock nesting (we are already under * msk->firstsocket lock). */ if (msk->fastopening) return -EBUSY; mptcp_check_listen_stop(sk); mptcp_set_state(sk, TCP_CLOSE); mptcp_stop_rtx_timer(sk); mptcp_stop_tout_timer(sk); mptcp_pm_connection_closed(msk); /* msk->subflow is still intact, the following will not free the first * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->recovery = false; WRITE_ONCE(msk->can_ack, false); WRITE_ONCE(msk->fully_established, false); WRITE_ONCE(msk->rcv_data_fin, false); WRITE_ONCE(msk->snd_data_fin_enable, false); WRITE_ONCE(msk->rcv_fastclose, false); WRITE_ONCE(msk->use_64bit_ack, false); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); msk->bytes_consumed = 0; msk->bytes_acked = 0; msk->bytes_received = 0; msk->bytes_sent = 0; msk->bytes_retrans = 0; msk->rcvspace_init = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) { const struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; struct ipv6_pinfo *newnp; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) net_warn_ratelimited("%s: Failed to copy ip6 options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } #endif static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) { struct ip_options_rcu *inet_opt, *newopt = NULL; const struct inet_sock *inet = inet_sk(sk); struct inet_sock *newinet; newinet = inet_sk(newsk); rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); if (newopt) memcpy(newopt, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); else net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); rcu_read_unlock(); } struct sock *mptcp_sk_clone_init(const struct sock *sk, const struct mptcp_options_received *mp_opt, struct sock *ssk, struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!nsk) return NULL; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk); #endif __mptcp_init_sock(nsk); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (nsk->sk_family == AF_INET6) mptcp_copy_ip6_options(nsk, sk); else #endif mptcp_copy_ip_options(nsk, sk); msk = mptcp_sk(nsk); WRITE_ONCE(msk->local_key, subflow_req->local_key); WRITE_ONCE(msk->token, subflow_req->token); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->snd_una, msk->write_seq); WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; sock_reset_flag(nsk, SOCK_RCU_FREE); security_inet_csk_clone(nsk, req); /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space */ mptcp_set_state(nsk, TCP_ESTABLISHED); /* The msk maintain a ref to each subflow in the connections list */ WRITE_ONCE(msk->first, ssk); subflow = mptcp_subflow_ctx(ssk); list_add(&subflow->node, &msk->conn_list); sock_hold(ssk); /* new mpc subflow takes ownership of the newly * created mptcp socket */ mptcp_token_accept(subflow_req, msk); /* set msk addresses early to ensure mptcp_pm_get_local_id() * uses the correct data */ mptcp_copy_inaddrs(nsk, ssk); __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) __mptcp_subflow_fully_established(msk, subflow, mp_opt); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ return nsk; } void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) { const struct tcp_sock *tp = tcp_sk(ssk); msk->rcvspace_init = 1; msk->rcvq_space.copied = 0; msk->rcvq_space.rtt_us = 0; msk->rcvq_space.time = tp->tcp_mstamp; /* initial rcv_space offering made to peer */ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; __mptcp_clear_xmit(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ mptcp_data_lock(sk); skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); WRITE_ONCE(msk->rmem_fwd_alloc, 0); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); mptcp_free_local_addr_list(msk); } static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* allow the following to close even the initial subflow */ msk->free_first = 1; mptcp_destroy_common(msk, 0); sk_sockets_allocated_dec(sk); } void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) __mptcp_clean_una(sk); else __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); } void __mptcp_check_push(struct sock *sk, struct sock *ssk) { if (!mptcp_send_head(sk)) return; if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); } #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ BIT(MPTCP_FLUSH_JOIN_LIST)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) __must_hold(&sk->sk_lock.slock) { struct mptcp_sock *msk = mptcp_sk(sk); for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); struct list_head join_list; if (!flags) break; INIT_LIST_HEAD(&join_list); list_splice_init(&msk->join_list, &join_list); /* the following actions acquire the subflow socket lock * * 1) can't be invoked in atomic scope * 2) must avoid ABBA deadlock with msk socket spinlock: the RX * datapath acquires the msk socket spinlock while helding * the subflow socket lock */ msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); if (flags & BIT(MPTCP_FLUSH_JOIN_LIST)) __mptcp_flush_join_list(sk, &join_list); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); cond_resched(); spin_lock_bh(&sk->sk_lock.slock); } if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) __mptcp_clean_una_wakeup(sk); if (unlikely(msk->cb_flags)) { /* be sure to sync the msk state before taking actions * depending on sk_state (MPTCP_ERROR_REPORT) * On sk release avoid actions depending on the first subflow */ if (__test_and_clear_bit(MPTCP_SYNC_STATE, &msk->cb_flags) && msk->first) __mptcp_sync_state(sk, msk->pending_state); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } __mptcp_update_rmem(sk); } /* MP_JOIN client subflow must wait for 4th ack before sending any data: * TCP can't schedule delack timer before the subflow is fully established. * MPTCP uses the delack timer to do 3rd ack retransmissions */ static void schedule_3rdack_retransmission(struct sock *ssk) { struct inet_connection_sock *icsk = inet_csk(ssk); struct tcp_sock *tp = tcp_sk(ssk); unsigned long timeout; if (READ_ONCE(mptcp_subflow_ctx(ssk)->fully_established)) return; /* reschedule with a timeout above RTT, as we must look only for drop */ if (tp->srtt_us) timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); else timeout = TCP_TIMEOUT_INIT; timeout += jiffies; WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } void mptcp_subflow_process_delegated(struct sock *ssk, long status) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = subflow->conn; if (status & BIT(MPTCP_DELEGATE_SEND)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_SNDBUF)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) __mptcp_sync_sndbuf(sk); else __set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } if (status & BIT(MPTCP_DELEGATE_ACK)) schedule_3rdack_retransmission(ssk); } static int mptcp_hash(struct sock *sk) { /* should never be called, * we hash the TCP subflows not the MPTCP socket */ WARN_ON_ONCE(1); return 0; } static void mptcp_unhash(struct sock *sk) { /* called from sk_common_release(), but nothing to do here */ } static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); pr_debug("msk=%p, ssk=%p\n", msk, msk->first); if (WARN_ON_ONCE(!msk->first)) return -EINVAL; return inet_csk_get_port(msk->first, snum); } void mptcp_finish_connect(struct sock *ssk) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; subflow = mptcp_subflow_ctx(ssk); sk = subflow->conn; msk = mptcp_sk(sk); pr_debug("msk=%p, token=%u\n", sk, subflow->token); subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); mptcp_pm_new_connection(msk, ssk, 0); } void mptcp_sock_graft(struct sock *sk, struct socket *parent) { write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; write_unlock_bh(&sk->sk_callback_lock); } bool mptcp_finish_join(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct sock *parent = (void *)msk; bool ret = true; pr_debug("msk=%p, subflow=%p\n", msk, subflow); /* mptcp socket already closing? */ if (!mptcp_is_fully_established(parent)) { subflow->reset_reason = MPTCP_RST_EMPTCP; return false; } /* active subflow, already present inside the conn_list */ if (!list_empty(&subflow->node)) { mptcp_subflow_joined(msk, ssk); mptcp_propagate_sndbuf(parent, ssk); return true; } if (!mptcp_pm_allow_new_subflow(msk)) goto err_prohibited; /* If we can't acquire msk socket lock here, let the release callback * handle it */ mptcp_data_lock(parent); if (!sock_owned_by_user(parent)) { ret = __mptcp_finish_join(msk, ssk); if (ret) { sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); } } else { sock_hold(ssk); list_add_tail(&subflow->node, &msk->join_list); __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); } mptcp_data_unlock(parent); if (!ret) { err_prohibited: subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; } return true; } static void mptcp_shutdown(struct sock *sk, int how) { pr_debug("sk=%p, how=%d\n", sk, how); if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) __mptcp_wr_shutdown(sk); } static int mptcp_forward_alloc_get(const struct sock *sk) { return READ_ONCE(sk->sk_forward_alloc) + READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); } static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; u64 delta; if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) return 0; delta = msk->write_seq - v; if (__mptcp_check_fallback(msk) && msk->first) { struct tcp_sock *tp = tcp_sk(msk->first); /* the first subflow is disconnected after close - see * __mptcp_close_ssk(). tcp_disconnect() moves the write_seq * so ignore that status, too. */ if (!((1 << msk->first->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))) delta += READ_ONCE(tp->write_seq) - tp->snd_una; } if (delta > INT_MAX) delta = INT_MAX; return (int)delta; } static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) { struct mptcp_sock *msk = mptcp_sk(sk); bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; lock_sock(sk); __mptcp_move_skbs(msk); *karg = mptcp_inq_hint(sk); release_sock(sk); break; case SIOCOUTQ: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); unlock_sock_fast(sk, slow); break; case SIOCOUTQNSD: slow = lock_sock_fast(sk); *karg = mptcp_ioctl_outq(msk, msk->snd_nxt); unlock_sock_fast(sk, slow); break; default: return -ENOIOCTLCMD; } return 0; } static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); int err = -EINVAL; struct sock *ssk; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_set_state(sk, TCP_SYN_SENT); subflow = mptcp_subflow_ctx(ssk); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of * TCP option space. */ if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif if (subflow->request_mptcp) { if (mptcp_active_should_disable(sk)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED); mptcp_subflow_early_fallback(msk, subflow); } else if (mptcp_token_new_connect(ssk) < 0) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); } } WRITE_ONCE(msk->write_seq, subflow->idsn); WRITE_ONCE(msk->snd_nxt, subflow->idsn); WRITE_ONCE(msk->snd_una, subflow->idsn); if (likely(!__mptcp_check_fallback(msk))) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); /* if reaching here via the fastopen/sendmsg path, the caller already * acquired the subflow socket lock, too. */ if (!msk->fastopening) lock_sock(ssk); /* the following mirrors closely a very small chunk of code from * __inet_stream_connect() */ if (ssk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(ssk)) { err = ssk->sk_prot->pre_connect(ssk, uaddr, addr_len); if (err) goto out; } err = ssk->sk_prot->connect(ssk, uaddr, addr_len); if (err < 0) goto out; inet_assign_bit(DEFER_CONNECT, sk, inet_test_bit(DEFER_CONNECT, ssk)); out: if (!msk->fastopening) release_sock(ssk); /* on successful connect, the msk state will be moved to established by * subflow_finish_connect() */ if (unlikely(err)) { /* avoid leaving a dangling token in an unconnected socket */ mptcp_token_destroy(msk); mptcp_set_state(sk, TCP_CLOSE); return err; } mptcp_copy_inaddrs(sk, ssk); return 0; } static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, .destroy = mptcp_destroy, .sendmsg = mptcp_sendmsg, .ioctl = mptcp_ioctl, .recvmsg = mptcp_recvmsg, .release_cb = mptcp_release_cb, .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, .forward_alloc_get = mptcp_forward_alloc_get, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .no_autobind = true, }; static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; int err = -EINVAL; lock_sock(sk); ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } if (sk->sk_family == AF_INET) err = inet_bind_sk(ssk, uaddr, addr_len); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (sk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, uaddr, addr_len); #endif if (!err) mptcp_copy_inaddrs(sk, ssk); unlock: release_sock(sk); return err; } static int mptcp_listen(struct socket *sock, int backlog) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *sk = sock->sk; struct sock *ssk; int err; pr_debug("msk=%p\n", msk); lock_sock(sk); err = -EINVAL; if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto unlock; ssk = __mptcp_nmpc_sk(msk); if (IS_ERR(ssk)) { err = PTR_ERR(ssk); goto unlock; } mptcp_set_state(sk, TCP_LISTEN); sock_set_flag(sk, SOCK_RCU_FREE); lock_sock(ssk); err = __inet_listen_sk(ssk, backlog); release_sock(ssk); mptcp_set_state(sk, inet_sk_state_load(ssk)); if (!err) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); mptcp_copy_inaddrs(sk, ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); } unlock: release_sock(sk); return err; } static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; pr_debug("msk=%p\n", msk); /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. */ ssk = READ_ONCE(msk->first); if (!ssk) return -EINVAL; pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk)); newsk = inet_csk_accept(ssk, arg); if (!newsk) return arg->err; pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; subflow = mptcp_subflow_ctx(newsk); new_mptcp_sock = subflow->conn; /* is_mptcp should be false if subflow->conn is missing, see * subflow_syn_recv_sock() */ if (WARN_ON_ONCE(!new_mptcp_sock)) { tcp_sk(newsk)->is_mptcp = 0; goto tcpfallback; } newsk = new_mptcp_sock; MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); msk = mptcp_sk(newsk); msk->in_accept_queue = 0; /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { __mptcp_close_ssk(newsk, msk->first, mptcp_subflow_ctx(msk->first), 0); if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); } } else { tcpfallback: newsk->sk_kern_sock = arg->kern; lock_sock(newsk); __inet_accept(sock, newsock, newsk); /* we are being invoked after accepting a non-mp-capable * flow: sk is a tcp_sk, not an mptcp one. * * Hand the socket over to tcp so all further socket ops * bypass mptcp. */ WRITE_ONCE(newsock->sk->sk_socket->ops, mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk); return 0; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); smp_mb__after_atomic(); /* NOSPACE is changed by mptcp_write_space() */ if (__mptcp_stream_is_writeable(sk, 1)) return EPOLLOUT | EPOLLWRNORM; return 0; } static __poll_t mptcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct sock *sk = sock->sk; struct mptcp_sock *msk; __poll_t mask = 0; u8 shutdown; int state; msk = mptcp_sk(sk); sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags); if (state == TCP_LISTEN) { struct sock *ssk = READ_ONCE(msk->first); if (WARN_ON_ONCE(!ssk)) return 0; return inet_csk_listen_poll(ssk); } shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(sk); if (shutdown & SEND_SHUTDOWN) mask |= EPOLLOUT | EPOLLWRNORM; else mask |= mptcp_check_writeable(msk); } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* cf tcp_poll() note about TFO */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; return mask; } static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet_getname, .poll = mptcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, }; static struct inet_protosw mptcp_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_prot, .ops = &mptcp_stream_ops, .flags = INET_PROTOSW_ICSK, }; static int mptcp_napi_poll(struct napi_struct *napi, int budget) { struct mptcp_delegated_action *delegated; struct mptcp_subflow_context *subflow; int work_done = 0; delegated = container_of(napi, struct mptcp_delegated_action, napi); while ((subflow = mptcp_subflow_delegated_next(delegated)) != NULL) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bh_lock_sock_nested(ssk); if (!sock_owned_by_user(ssk)) { mptcp_subflow_process_delegated(ssk, xchg(&subflow->delegated_status, 0)); } else { /* tcp_release_cb_override already processed * the action or will do at next release_sock(). * In both case must dequeue the subflow here - on the same * CPU that scheduled it. */ smp_wmb(); clear_bit(MPTCP_DELEGATE_SCHEDULED, &subflow->delegated_status); } bh_unlock_sock(ssk); sock_put(ssk); if (++work_done == budget) return budget; } /* always provide a 0 'work_done' argument, so that napi_complete_done * will not try accessing the NULL napi->dev ptr */ napi_complete_done(napi, 0); return work_done; } void __init mptcp_proto_init(void) { struct mptcp_delegated_action *delegated; int cpu; mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); mptcp_napi_dev = alloc_netdev_dummy(0); if (!mptcp_napi_dev) panic("Failed to allocate MPTCP dummy netdev\n"); for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); netif_napi_add_tx(mptcp_napi_dev, &delegated->napi, mptcp_napi_poll); napi_enable(&delegated->napi); } mptcp_subflow_init(); mptcp_pm_init(); mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); inet_register_protosw(&mptcp_protosw); BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb)); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) static const struct proto_ops mptcp_v6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = mptcp_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = mptcp_stream_accept, .getname = inet6_getname, .poll = mptcp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = mptcp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet6_sendmsg, .recvmsg = inet6_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, }; static struct proto mptcp_v6_prot; static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, .prot = &mptcp_v6_prot, .ops = &mptcp_v6_stream_ops, .flags = INET_PROTOSW_ICSK, }; int __init mptcp_proto_v6_init(void) { int err; mptcp_v6_prot = mptcp_prot; strscpy(mptcp_v6_prot.name, "MPTCPv6", sizeof(mptcp_v6_prot.name)); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) return err; err = inet6_register_protosw(&mptcp_v6_protosw); if (err) proto_unregister(&mptcp_v6_prot); return err; } #endif
2 1 1 1 1 2 1 10 1 8 1 1 4 2 2 1 1 3 5 1 4 3 2 5 4 2 3 3 2 1 27 4 22 8 13 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2008 Red Hat, Inc. All rights reserved. * Copyright 2008 Ian Kent <raven@themaw.net> */ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/compat.h> #include <linux/fdtable.h> #include <linux/magic.h> #include <linux/nospec.h> #include "autofs_i.h" /* * This module implements an interface for routing autofs ioctl control * commands via a miscellaneous device file. * * The alternate interface is needed because we need to be able open * an ioctl file descriptor on an autofs mount that may be covered by * another mount. This situation arises when starting automount(8) * or other user space daemon which uses direct mounts or offset * mounts (used for autofs lazy mount/umount of nested mount trees), * which have been left busy at service shutdown. */ typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, struct autofs_dev_ioctl *); static int check_name(const char *name) { if (!strchr(name, '/')) return -EINVAL; return 0; } /* * Check a string doesn't overrun the chunk of * memory we copied from user land. */ static int invalid_str(char *str, size_t size) { if (memchr(str, 0, size)) return 0; return -EINVAL; } /* * Check that the user compiled against correct version of autofs * misc device code. * * As well as checking the version compatibility this always copies * the kernel interface version out. */ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) { int err = 0; if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { pr_warn("ioctl control interface version mismatch: " "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); err = -EINVAL; } /* Fill in the kernel version. */ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return err; } /* * Copy parameter control struct, including a possible path allocated * at the end of the struct. */ static struct autofs_dev_ioctl * copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) return ERR_PTR(-EFAULT); if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) return ERR_PTR(-EINVAL); if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) return ERR_PTR(-ENAMETOOLONG); res = memdup_user(in, tmp.size); if (!IS_ERR(res)) res->size = tmp.size; return res; } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) { kfree(param); } /* * Check sanity of parameter control fields and if a path is present * check that it is terminated and contains at least one "/". */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { unsigned int inr = _IOC_NR(cmd); int err; err = check_dev_ioctl_version(cmd, param); if (err) { pr_warn("invalid device control module version " "supplied for cmd(0x%08x)\n", cmd); goto out; } if (param->size > AUTOFS_DEV_IOCTL_SIZE) { err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); if (err) { pr_warn( "path string terminator missing for cmd(0x%08x)\n", cmd); goto out; } /* Setting the per-dentry expire timeout requires a trailing * path component, ie. no '/', so invert the logic of the * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD. */ err = check_name(param->path); if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD) err = err ? 0 : -EINVAL; if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", cmd); goto out; } } else { if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { err = -EINVAL; goto out; } } err = 0; out: return err; } /* Return autofs dev ioctl version */ static int autofs_dev_ioctl_version(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { /* This should have already been set. */ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return 0; } /* Return autofs module protocol version */ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protover.version = sbi->version; return 0; } /* Return autofs module protocol sub version */ static int autofs_dev_ioctl_protosubver(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protosubver.sub_version = sbi->sub_version; return 0; } /* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, int test(const struct path *path, void *data), void *data) { struct path path; int err; err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path); if (err) return err; err = -ENOENT; while (path.dentry == path.mnt->mnt_root) { if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); *res = path; err = 0; break; } } if (!follow_up(&path)) break; } path_put(&path); return err; } static int test_by_dev(const struct path *path, void *p) { return path->dentry->d_sb->s_dev == *(dev_t *)p; } static int test_by_type(const struct path *path, void *p) { struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } /* * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). */ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { int err, fd; fd = get_unused_fd_flags(O_CLOEXEC); if (likely(fd >= 0)) { struct file *filp; struct path path; err = find_autofs_mount(name, &path, test_by_dev, &devid); if (err) goto out; filp = dentry_open(&path, O_RDONLY, current_cred()); path_put(&path); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } fd_install(fd, filp); } return fd; out: put_unused_fd(fd); return err; } /* Open a file descriptor on an autofs mount point */ static int autofs_dev_ioctl_openmount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { const char *path; dev_t devid; int err, fd; /* param->path has been checked in validate_dev_ioctl() */ if (!param->openmount.devid) return -EINVAL; param->ioctlfd = -1; path = param->path; devid = new_decode_dev(param->openmount.devid); err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); if (unlikely(fd < 0)) { err = fd; goto out; } param->ioctlfd = fd; out: return err; } /* Close file descriptor allocated above (user can also use close(2)). */ static int autofs_dev_ioctl_closemount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { return close_fd(param->ioctlfd); } /* * Send "ready" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_ready(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; return autofs_wait_release(sbi, token, 0); } /* * Send "fail" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_fail(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; int status; token = (autofs_wqt_t) param->fail.token; status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs_wait_release(sbi, token, status); } /* * Set the pipe fd for kernel communication to the daemon. * * Normally this is set at mount using an option but if we * are reconnecting to a busy mount then we need to use this * to tell the autofs mount about the new kernel pipe fd. In * order to protect mounts against incorrectly setting the * pipefd we also require that the autofs mount be catatonic. * * This also sets the process group id used to identify the * controlling process (eg. the owning automount(8) daemon). */ static int autofs_dev_ioctl_setpipefd(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { int pipefd; int err = 0; struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) { mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { struct file *pipe; new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { pr_warn("not allowed to change PID namespace\n"); err = -EINVAL; goto out; } pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; } if (autofs_prepare_pipe(pipe) < 0) { err = -EPIPE; fput(pipe); goto out; } swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->flags &= ~AUTOFS_SBI_CATATONIC; } out: put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } /* * Make the autofs mount point catatonic, no longer responsive to * mount requests. Also closes the kernel pipe file descriptor. */ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_catatonic_mode(sbi); return 0; } /* * Set the autofs mount expire timeout. * * There are two places an expire timeout can be set, in the autofs * super block info. (this is all that's needed for direct and offset * mounts because there's a distinct mount corresponding to each of * these) and per-dentry within within the dentry info. If a per-dentry * timeout is set it will override the expire timeout set in the parent * autofs super block info. * * If setting the autofs super block expire timeout the autofs_dev_ioctl * size field will be equal to the autofs_dev_ioctl structure size. If * setting the per-dentry expire timeout the mount point name is passed * in the autofs_dev_ioctl path field and the size field updated to * reflect this. * * Setting the autofs mount expire timeout sets the timeout in the super * block info. struct. Setting the per-dentry timeout does a little more. * If the timeout is equal to -1 the per-dentry timeout (and flag) is * cleared which reverts to using the super block timeout, otherwise if * timeout is 0 the timeout is set to this value and the flag is left * set which disables expiration for the mount point, lastly the flag * and the timeout are set enabling the dentry to use this timeout. */ static int autofs_dev_ioctl_timeout(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { unsigned long timeout = param->timeout.timeout; /* If setting the expire timeout for an individual indirect * mount point dentry the mount trailing component path is * placed in param->path and param->size adjusted to account * for it otherwise param->size it is set to the structure * size. */ if (param->size == AUTOFS_DEV_IOCTL_SIZE) { param->timeout.timeout = sbi->exp_timeout / HZ; sbi->exp_timeout = timeout * HZ; } else { struct dentry *base = fp->f_path.dentry; struct inode *inode = base->d_inode; int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; struct dentry *dentry; struct autofs_info *ino; if (!autofs_type_indirect(sbi->type)) return -EINVAL; /* An expire timeout greater than the superblock timeout * could be a problem at shutdown but the super block * timeout itself can change so all we can really do is * warn the user. */ if (timeout >= sbi->exp_timeout) pr_warn("per-mount expire timeout is greater than " "the parent autofs mount timeout which could " "prevent shutdown\n"); inode_lock_shared(inode); dentry = try_lookup_one_len(param->path, base, path_len); inode_unlock_shared(inode); if (IS_ERR_OR_NULL(dentry)) return dentry ? PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); if (!ino) { dput(dentry); return -ENOENT; } if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET) param->timeout.timeout = ino->exp_timeout / HZ; else param->timeout.timeout = sbi->exp_timeout / HZ; if (timeout == -1) { /* Revert to using the super block timeout */ ino->flags &= ~AUTOFS_INF_EXPIRE_SET; ino->exp_timeout = 0; } else { /* Set the dentry expire flag and timeout. * * If timeout is 0 it will prevent the expire * of this particular automount. */ ino->flags |= AUTOFS_INF_EXPIRE_SET; ino->exp_timeout = timeout * HZ; } dput(dentry); } return 0; } /* * Return the uid and gid of the last request for the mount * * When reconstructing an autofs mount tree with active mounts * we need to re-connect to mounts that may have used the original * process uid and gid (or string variations of them) for mount * lookups within the map entry. */ static int autofs_dev_ioctl_requester(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct autofs_info *ino; struct path path; dev_t devid; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ devid = sbi->sb->s_dev; param->requester.uid = param->requester.gid = -1; err = find_autofs_mount(param->path, &path, test_by_dev, &devid); if (err) goto out; ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); out: return err; } /* * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more that can be done. */ static int autofs_dev_ioctl_expire(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct vfsmount *mnt; int how; how = param->expire.how; mnt = fp->f_path.mnt; return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ static int autofs_dev_ioctl_askumount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->askumount.may_umount = 0; if (may_umount(fp->f_path.mnt)) param->askumount.may_umount = 1; return 0; } /* * Check if the given path is a mountpoint. * * If we are supplied with the file descriptor of an autofs * mount we're looking for a specific mount. In this case * the path is considered a mountpoint if it is itself a * mountpoint or contains a mount, such as a multi-mount * without a root mount. In this case we return 1 if the * path is a mount point and the super magic of the covering * mount if there is one or 0 if it isn't a mountpoint. * * If we aren't supplied with a file descriptor then we * lookup the path and check if it is the root of a mount. * If a type is given we are looking for a particular autofs * mount and if we don't find a match we return fail. If the * located path is the root of a mount we return 1 along with * the super magic of the mount or 0 otherwise. * * In both cases the device number (as returned by * new_encode_dev()) is also returned. */ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct path path; const char *name; unsigned int type; unsigned int devid, magic; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ name = param->path; type = param->ismountpoint.in.type; param->ismountpoint.out.devid = devid = 0; param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT, &path); else err = find_autofs_mount(name, &path, test_by_type, &type); if (err) goto out; devid = new_encode_dev(path.dentry->d_sb->s_dev); err = 0; if (path.mnt->mnt_root == path.dentry) { err = 1; magic = path.dentry->d_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; err = find_autofs_mount(name, &path, test_by_dev, &dev); if (err) goto out; devid = new_encode_dev(dev); err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; } param->ismountpoint.out.devid = devid; param->ismountpoint.out.magic = magic; path_put(&path); out: return err; } /* * Our range of ioctl numbers isn't 0 based so we need to shift * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table * lookup. */ #define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, autofs_dev_ioctl_openmount, autofs_dev_ioctl_closemount, autofs_dev_ioctl_ready, autofs_dev_ioctl_fail, autofs_dev_ioctl_setpipefd, autofs_dev_ioctl_catatonic, autofs_dev_ioctl_timeout, autofs_dev_ioctl_requester, autofs_dev_ioctl_expire, autofs_dev_ioctl_askumount, autofs_dev_ioctl_ismountpoint, }; unsigned int idx = cmd_idx(cmd); if (idx >= ARRAY_SIZE(_ioctls)) return NULL; idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); return _ioctls[idx]; } /* ioctl dispatcher */ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) { struct autofs_dev_ioctl *param; struct file *fp; struct autofs_sb_info *sbi; unsigned int cmd_first, cmd; ioctl_fn fn = NULL; int err = 0; cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { return -ENOTTY; } /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD */ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && !capable(CAP_SYS_ADMIN)) return -EPERM; /* Copy the parameters into kernel space. */ param = copy_dev_ioctl(user); if (IS_ERR(param)) return PTR_ERR(param); err = validate_dev_ioctl(command, param); if (err) goto out; fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); err = -ENOTTY; goto out; } fp = NULL; sbi = NULL; /* * For obvious reasons the openmount can't have a file * descriptor yet. We don't take a reference to the * file during close to allow for immediate release, * and the same for retrieving ioctl version. */ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { struct super_block *sb; fp = fget(param->ioctlfd); if (!fp) { if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) goto cont; err = -EBADF; goto out; } sb = file_inode(fp)->i_sb; if (sb->s_type != &autofs_fs_type) { err = -EINVAL; fput(fp); goto out; } sbi = autofs_sbi(sb); /* * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. */ if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); goto out; } } cont: err = fn(fp, sbi, param); if (fp) fput(fp); if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) err = -EFAULT; out: free_dev_ioctl(param); return err; } static long autofs_dev_ioctl(struct file *file, unsigned int command, unsigned long u) { int err; err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); return (long) err; } #ifdef CONFIG_COMPAT static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, unsigned long u) { return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); } #else #define autofs_dev_ioctl_compat NULL #endif static const struct file_operations _dev_ioctl_fops = { .unlocked_ioctl = autofs_dev_ioctl, .compat_ioctl = autofs_dev_ioctl_compat, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops, .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); MODULE_ALIAS("devname:autofs"); /* Register/deregister misc character device */ int __init autofs_dev_ioctl_init(void) { int r; r = misc_register(&_autofs_dev_ioctl_misc); if (r) { pr_err("misc_register failed for control device\n"); return r; } return 0; } void autofs_dev_ioctl_exit(void) { misc_deregister(&_autofs_dev_ioctl_misc); }
2 2 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct wol_req_info { struct ethnl_req_info base; }; struct wol_reply_data { struct ethnl_reply_data base; struct ethtool_wolinfo wol; bool show_sopass; }; #define WOL_REPDATA(__reply_base) \ container_of(__reply_base, struct wol_reply_data, base) const struct nla_policy ethnl_wol_get_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int wol_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct wol_reply_data *data = WOL_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_wol(dev, &data->wol); ethnl_ops_complete(dev); /* do not include password in notifications */ data->show_sopass = !genl_info_is_ntf(info) && (data->wol.supported & WAKE_MAGICSECURE); return 0; } static int wol_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int len; len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (len < 0) return len; if (data->show_sopass) len += nla_total_size(sizeof(data->wol.sopass)); return len; } static int wol_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (ret < 0) return ret; if (data->show_sopass && nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass), data->wol.sopass)) return -EMSGSIZE; return 0; } /* WOL_SET */ const struct nla_policy ethnl_wol_set_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED }, [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY, .len = SOPASS_MAX }, }; static int ethnl_set_wol_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_wol && ops->set_wol ? 1 : -EOPNOTSUPP; } static int ethnl_set_wol(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_wol(dev, &wol); ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, tb[ETHTOOL_A_WOL_MODES], wol_mode_names, info->extack, &mod); if (ret < 0) return ret; if (wol.wolopts & ~wol.supported) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], "cannot enable unsupported WoL mode"); return -EINVAL; } if (tb[ETHTOOL_A_WOL_SOPASS]) { if (!(wol.supported & WAKE_MAGICSECURE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_SOPASS], "magicsecure not supported, cannot set password"); return -EINVAL; } ethnl_update_binary(wol.sopass, sizeof(wol.sopass), tb[ETHTOOL_A_WOL_SOPASS], &mod); } if (!mod) return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; dev->ethtool->wol_enabled = !!wol.wolopts; return 1; } const struct ethnl_request_ops ethnl_wol_request_ops = { .request_cmd = ETHTOOL_MSG_WOL_GET, .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, .hdr_attr = ETHTOOL_A_WOL_HEADER, .req_info_size = sizeof(struct wol_req_info), .reply_data_size = sizeof(struct wol_reply_data), .prepare_data = wol_prepare_data, .reply_size = wol_reply_size, .fill_reply = wol_fill_reply, .set_validate = ethnl_set_wol_validate, .set = ethnl_set_wol, .set_ntf_cmd = ETHTOOL_MSG_WOL_NTF, };
4 4 4 4 4 21 4 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2020-2022, Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_ag.h" #include "xfs_iunlink_item.h" #include "xfs_trace.h" #include "xfs_error.h" struct kmem_cache *xfs_iunlink_cache; static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_iunlink_item, item); } static void xfs_iunlink_item_release( struct xfs_log_item *lip) { struct xfs_iunlink_item *iup = IUL_ITEM(lip); xfs_perag_put(iup->pag); kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip)); } static uint64_t xfs_iunlink_item_sort( struct xfs_log_item *lip) { return IUL_ITEM(lip)->ip->i_ino; } /* * Look up the inode cluster buffer and log the on-disk unlinked inode change * we need to make. */ static int xfs_iunlink_log_dinode( struct xfs_trans *tp, struct xfs_iunlink_item *iup) { struct xfs_inode *ip = iup->ip; struct xfs_dinode *dip; struct xfs_buf *ibp; xfs_agino_t old_ptr; int offset; int error; error = xfs_imap_to_bp(tp->t_mountp, tp, &ip->i_imap, &ibp); if (error) return error; /* * Don't log the unlinked field on stale buffers as this may be the * transaction that frees the inode cluster and relogging the buffer * here will incorrectly remove the stale state. */ if (ibp->b_flags & XBF_STALE) goto out; dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); /* Make sure the old pointer isn't garbage. */ old_ptr = be32_to_cpu(dip->di_next_unlinked); if (old_ptr != iup->old_agino) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; goto out; } trace_xfs_iunlink_update_dinode(iup, old_ptr); dip->di_next_unlinked = cpu_to_be32(iup->next_agino); offset = ip->i_imap.im_boffset + offsetof(struct xfs_dinode, di_next_unlinked); xfs_dinode_calc_crc(tp->t_mountp, dip); xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); return 0; out: xfs_trans_brelse(tp, ibp); return error; } /* * On precommit, we grab the inode cluster buffer for the inode number we were * passed, then update the next unlinked field for that inode in the buffer and * log the buffer. This ensures that the inode cluster buffer was logged in the * correct order w.r.t. other inode cluster buffers. We can then remove the * iunlink item from the transaction and release it as it is has now served it's * purpose. */ static int xfs_iunlink_item_precommit( struct xfs_trans *tp, struct xfs_log_item *lip) { struct xfs_iunlink_item *iup = IUL_ITEM(lip); int error; error = xfs_iunlink_log_dinode(tp, iup); list_del(&lip->li_trans); xfs_iunlink_item_release(lip); return error; } static const struct xfs_item_ops xfs_iunlink_item_ops = { .iop_release = xfs_iunlink_item_release, .iop_sort = xfs_iunlink_item_sort, .iop_precommit = xfs_iunlink_item_precommit, }; /* * Initialize the inode log item for a newly allocated (in-core) inode. * * Inode extents can only reside within an AG. Hence specify the starting * block for the inode chunk by offset within an AG as well as the * length of the allocated extent. * * This joins the item to the transaction and marks it dirty so * that we don't need a separate call to do this, nor does the * caller need to know anything about the iunlink item. */ int xfs_iunlink_log_inode( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_perag *pag, xfs_agino_t next_agino) { struct xfs_mount *mp = tp->t_mountp; struct xfs_iunlink_item *iup; ASSERT(xfs_verify_agino_or_null(pag, next_agino)); ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked)); /* * Since we're updating a linked list, we should never find that the * current pointer is the same as the new value, unless we're * terminating the list. */ if (ip->i_next_unlinked == next_agino) { if (next_agino != NULLAGINO) return -EFSCORRUPTED; return 0; } iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK, &xfs_iunlink_item_ops); iup->ip = ip; iup->next_agino = next_agino; iup->old_agino = ip->i_next_unlinked; iup->pag = xfs_perag_hold(pag); xfs_trans_add_item(tp, &iup->item); tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &iup->item.li_flags); return 0; }
2 17 17 2 2 2 17 17 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 // SPDX-License-Identifier: GPL-2.0 #include <linux/pci.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/wait.h> #include "pci.h" /* * This interrupt-safe spinlock protects all accesses to PCI * configuration space. */ DEFINE_RAW_SPINLOCK(pci_lock); /* * Wrappers for all PCI configuration access functions. They just check * alignment, do locking and call the low-level functions pointed to * by pci_dev->ops. */ #define PCI_byte_BAD 0 #define PCI_word_BAD (pos & 1) #define PCI_dword_BAD (pos & 3) #ifdef CONFIG_PCI_LOCKLESS_CONFIG # define pci_lock_config(f) do { (void)(f); } while (0) # define pci_unlock_config(f) do { (void)(f); } while (0) #else # define pci_lock_config(f) raw_spin_lock_irqsave(&pci_lock, f) # define pci_unlock_config(f) raw_spin_unlock_irqrestore(&pci_lock, f) #endif #define PCI_OP_READ(size, type, len) \ int noinline pci_bus_read_config_##size \ (struct pci_bus *bus, unsigned int devfn, int pos, type *value) \ { \ unsigned long flags; \ u32 data = 0; \ int res; \ \ if (PCI_##size##_BAD) \ return PCIBIOS_BAD_REGISTER_NUMBER; \ \ pci_lock_config(flags); \ res = bus->ops->read(bus, devfn, pos, len, &data); \ if (res) \ PCI_SET_ERROR_RESPONSE(value); \ else \ *value = (type)data; \ pci_unlock_config(flags); \ \ return res; \ } #define PCI_OP_WRITE(size, type, len) \ int noinline pci_bus_write_config_##size \ (struct pci_bus *bus, unsigned int devfn, int pos, type value) \ { \ unsigned long flags; \ int res; \ \ if (PCI_##size##_BAD) \ return PCIBIOS_BAD_REGISTER_NUMBER; \ \ pci_lock_config(flags); \ res = bus->ops->write(bus, devfn, pos, len, value); \ pci_unlock_config(flags); \ \ return res; \ } PCI_OP_READ(byte, u8, 1) PCI_OP_READ(word, u16, 2) PCI_OP_READ(dword, u32, 4) PCI_OP_WRITE(byte, u8, 1) PCI_OP_WRITE(word, u16, 2) PCI_OP_WRITE(dword, u32, 4) EXPORT_SYMBOL(pci_bus_read_config_byte); EXPORT_SYMBOL(pci_bus_read_config_word); EXPORT_SYMBOL(pci_bus_read_config_dword); EXPORT_SYMBOL(pci_bus_write_config_byte); EXPORT_SYMBOL(pci_bus_write_config_word); EXPORT_SYMBOL(pci_bus_write_config_dword); int pci_generic_config_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { void __iomem *addr; addr = bus->ops->map_bus(bus, devfn, where); if (!addr) return PCIBIOS_DEVICE_NOT_FOUND; if (size == 1) *val = readb(addr); else if (size == 2) *val = readw(addr); else *val = readl(addr); return PCIBIOS_SUCCESSFUL; } EXPORT_SYMBOL_GPL(pci_generic_config_read); int pci_generic_config_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { void __iomem *addr; addr = bus->ops->map_bus(bus, devfn, where); if (!addr) return PCIBIOS_DEVICE_NOT_FOUND; if (size == 1) writeb(val, addr); else if (size == 2) writew(val, addr); else writel(val, addr); return PCIBIOS_SUCCESSFUL; } EXPORT_SYMBOL_GPL(pci_generic_config_write); int pci_generic_config_read32(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { void __iomem *addr; addr = bus->ops->map_bus(bus, devfn, where & ~0x3); if (!addr) return PCIBIOS_DEVICE_NOT_FOUND; *val = readl(addr); if (size <= 2) *val = (*val >> (8 * (where & 3))) & ((1 << (size * 8)) - 1); return PCIBIOS_SUCCESSFUL; } EXPORT_SYMBOL_GPL(pci_generic_config_read32); int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { void __iomem *addr; u32 mask, tmp; addr = bus->ops->map_bus(bus, devfn, where & ~0x3); if (!addr) return PCIBIOS_DEVICE_NOT_FOUND; if (size == 4) { writel(val, addr); return PCIBIOS_SUCCESSFUL; } /* * In general, hardware that supports only 32-bit writes on PCI is * not spec-compliant. For example, software may perform a 16-bit * write. If the hardware only supports 32-bit accesses, we must * do a 32-bit read, merge in the 16 bits we intend to write, * followed by a 32-bit write. If the 16 bits we *don't* intend to * write happen to have any RW1C (write-one-to-clear) bits set, we * just inadvertently cleared something we shouldn't have. */ if (!bus->unsafe_warn) { dev_warn(&bus->dev, "%d-byte config write to %04x:%02x:%02x.%d offset %#x may corrupt adjacent RW1C bits\n", size, pci_domain_nr(bus), bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn), where); bus->unsafe_warn = 1; } mask = ~(((1 << (size * 8)) - 1) << ((where & 0x3) * 8)); tmp = readl(addr) & mask; tmp |= val << ((where & 0x3) * 8); writel(tmp, addr); return PCIBIOS_SUCCESSFUL; } EXPORT_SYMBOL_GPL(pci_generic_config_write32); /** * pci_bus_set_ops - Set raw operations of pci bus * @bus: pci bus struct * @ops: new raw operations * * Return previous raw operations */ struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops) { struct pci_ops *old_ops; unsigned long flags; raw_spin_lock_irqsave(&pci_lock, flags); old_ops = bus->ops; bus->ops = ops; raw_spin_unlock_irqrestore(&pci_lock, flags); return old_ops; } EXPORT_SYMBOL(pci_bus_set_ops); /* * The following routines are to prevent the user from accessing PCI config * space when it's unsafe to do so. Some devices require this during BIST and * we're required to prevent it during D-state transitions. * * We have a bit per device to indicate it's blocked and a global wait queue * for callers to sleep on until devices are unblocked. */ static DECLARE_WAIT_QUEUE_HEAD(pci_cfg_wait); static noinline void pci_wait_cfg(struct pci_dev *dev) __must_hold(&pci_lock) { do { raw_spin_unlock_irq(&pci_lock); wait_event(pci_cfg_wait, !dev->block_cfg_access); raw_spin_lock_irq(&pci_lock); } while (dev->block_cfg_access); } /* Returns 0 on success, negative values indicate error. */ #define PCI_USER_READ_CONFIG(size, type) \ int pci_user_read_config_##size \ (struct pci_dev *dev, int pos, type *val) \ { \ u32 data = -1; \ int ret; \ \ if (PCI_##size##_BAD) \ return -EINVAL; \ \ raw_spin_lock_irq(&pci_lock); \ if (unlikely(dev->block_cfg_access)) \ pci_wait_cfg(dev); \ ret = dev->bus->ops->read(dev->bus, dev->devfn, \ pos, sizeof(type), &data); \ raw_spin_unlock_irq(&pci_lock); \ if (ret) \ PCI_SET_ERROR_RESPONSE(val); \ else \ *val = (type)data; \ \ return pcibios_err_to_errno(ret); \ } \ EXPORT_SYMBOL_GPL(pci_user_read_config_##size); /* Returns 0 on success, negative values indicate error. */ #define PCI_USER_WRITE_CONFIG(size, type) \ int pci_user_write_config_##size \ (struct pci_dev *dev, int pos, type val) \ { \ int ret; \ \ if (PCI_##size##_BAD) \ return -EINVAL; \ \ raw_spin_lock_irq(&pci_lock); \ if (unlikely(dev->block_cfg_access)) \ pci_wait_cfg(dev); \ ret = dev->bus->ops->write(dev->bus, dev->devfn, \ pos, sizeof(type), val); \ raw_spin_unlock_irq(&pci_lock); \ \ return pcibios_err_to_errno(ret); \ } \ EXPORT_SYMBOL_GPL(pci_user_write_config_##size); PCI_USER_READ_CONFIG(byte, u8) PCI_USER_READ_CONFIG(word, u16) PCI_USER_READ_CONFIG(dword, u32) PCI_USER_WRITE_CONFIG(byte, u8) PCI_USER_WRITE_CONFIG(word, u16) PCI_USER_WRITE_CONFIG(dword, u32) /** * pci_cfg_access_lock - Lock PCI config reads/writes * @dev: pci device struct * * When access is locked, any userspace reads or writes to config * space and concurrent lock requests will sleep until access is * allowed via pci_cfg_access_unlock() again. */ void pci_cfg_access_lock(struct pci_dev *dev) { might_sleep(); raw_spin_lock_irq(&pci_lock); if (dev->block_cfg_access) pci_wait_cfg(dev); dev->block_cfg_access = 1; raw_spin_unlock_irq(&pci_lock); } EXPORT_SYMBOL_GPL(pci_cfg_access_lock); /** * pci_cfg_access_trylock - try to lock PCI config reads/writes * @dev: pci device struct * * Same as pci_cfg_access_lock, but will return 0 if access is * already locked, 1 otherwise. This function can be used from * atomic contexts. */ bool pci_cfg_access_trylock(struct pci_dev *dev) { unsigned long flags; bool locked = true; raw_spin_lock_irqsave(&pci_lock, flags); if (dev->block_cfg_access) locked = false; else dev->block_cfg_access = 1; raw_spin_unlock_irqrestore(&pci_lock, flags); return locked; } EXPORT_SYMBOL_GPL(pci_cfg_access_trylock); /** * pci_cfg_access_unlock - Unlock PCI config reads/writes * @dev: pci device struct * * This function allows PCI config accesses to resume. */ void pci_cfg_access_unlock(struct pci_dev *dev) { unsigned long flags; raw_spin_lock_irqsave(&pci_lock, flags); /* * This indicates a problem in the caller, but we don't need * to kill them, unlike a double-block above. */ WARN_ON(!dev->block_cfg_access); dev->block_cfg_access = 0; raw_spin_unlock_irqrestore(&pci_lock, flags); wake_up_all(&pci_cfg_wait); } EXPORT_SYMBOL_GPL(pci_cfg_access_unlock); static inline int pcie_cap_version(const struct pci_dev *dev) { return pcie_caps_reg(dev) & PCI_EXP_FLAGS_VERS; } bool pcie_cap_has_lnkctl(const struct pci_dev *dev) { int type = pci_pcie_type(dev); return type == PCI_EXP_TYPE_ENDPOINT || type == PCI_EXP_TYPE_LEG_END || type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_UPSTREAM || type == PCI_EXP_TYPE_DOWNSTREAM || type == PCI_EXP_TYPE_PCI_BRIDGE || type == PCI_EXP_TYPE_PCIE_BRIDGE; } bool pcie_cap_has_lnkctl2(const struct pci_dev *dev) { return pcie_cap_has_lnkctl(dev) && pcie_cap_version(dev) > 1; } static inline bool pcie_cap_has_sltctl(const struct pci_dev *dev) { return pcie_downstream_port(dev) && pcie_caps_reg(dev) & PCI_EXP_FLAGS_SLOT; } bool pcie_cap_has_rtctl(const struct pci_dev *dev) { int type = pci_pcie_type(dev); return type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_RC_EC; } static bool pcie_capability_reg_implemented(struct pci_dev *dev, int pos) { if (!pci_is_pcie(dev)) return false; switch (pos) { case PCI_EXP_FLAGS: return true; case PCI_EXP_DEVCAP: case PCI_EXP_DEVCTL: case PCI_EXP_DEVSTA: return true; case PCI_EXP_LNKCAP: case PCI_EXP_LNKCTL: case PCI_EXP_LNKSTA: return pcie_cap_has_lnkctl(dev); case PCI_EXP_SLTCAP: case PCI_EXP_SLTCTL: case PCI_EXP_SLTSTA: return pcie_cap_has_sltctl(dev); case PCI_EXP_RTCTL: case PCI_EXP_RTCAP: case PCI_EXP_RTSTA: return pcie_cap_has_rtctl(dev); case PCI_EXP_DEVCAP2: case PCI_EXP_DEVCTL2: return pcie_cap_version(dev) > 1; case PCI_EXP_LNKCAP2: case PCI_EXP_LNKCTL2: case PCI_EXP_LNKSTA2: return pcie_cap_has_lnkctl2(dev); default: return false; } } /* * Note that these accessor functions are only for the "PCI Express * Capability" (see PCIe spec r3.0, sec 7.8). They do not apply to the * other "PCI Express Extended Capabilities" (AER, VC, ACS, MFVC, etc.) */ int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val) { int ret; *val = 0; if (pos & 1) return PCIBIOS_BAD_REGISTER_NUMBER; if (pcie_capability_reg_implemented(dev, pos)) { ret = pci_read_config_word(dev, pci_pcie_cap(dev) + pos, val); /* * Reset *val to 0 if pci_read_config_word() fails; it may * have been written as 0xFFFF (PCI_ERROR_RESPONSE) if the * config read failed on PCI. */ if (ret) *val = 0; return ret; } /* * For Functions that do not implement the Slot Capabilities, * Slot Status, and Slot Control registers, these spaces must * be hardwired to 0b, with the exception of the Presence Detect * State bit in the Slot Status register of Downstream Ports, * which must be hardwired to 1b. (PCIe Base Spec 3.0, sec 7.8) */ if (pci_is_pcie(dev) && pcie_downstream_port(dev) && pos == PCI_EXP_SLTSTA) *val = PCI_EXP_SLTSTA_PDS; return 0; } EXPORT_SYMBOL(pcie_capability_read_word); int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val) { int ret; *val = 0; if (pos & 3) return PCIBIOS_BAD_REGISTER_NUMBER; if (pcie_capability_reg_implemented(dev, pos)) { ret = pci_read_config_dword(dev, pci_pcie_cap(dev) + pos, val); /* * Reset *val to 0 if pci_read_config_dword() fails; it may * have been written as 0xFFFFFFFF (PCI_ERROR_RESPONSE) if * the config read failed on PCI. */ if (ret) *val = 0; return ret; } if (pci_is_pcie(dev) && pcie_downstream_port(dev) && pos == PCI_EXP_SLTSTA) *val = PCI_EXP_SLTSTA_PDS; return 0; } EXPORT_SYMBOL(pcie_capability_read_dword); int pcie_capability_write_word(struct pci_dev *dev, int pos, u16 val) { if (pos & 1) return PCIBIOS_BAD_REGISTER_NUMBER; if (!pcie_capability_reg_implemented(dev, pos)) return 0; return pci_write_config_word(dev, pci_pcie_cap(dev) + pos, val); } EXPORT_SYMBOL(pcie_capability_write_word); int pcie_capability_write_dword(struct pci_dev *dev, int pos, u32 val) { if (pos & 3) return PCIBIOS_BAD_REGISTER_NUMBER; if (!pcie_capability_reg_implemented(dev, pos)) return 0; return pci_write_config_dword(dev, pci_pcie_cap(dev) + pos, val); } EXPORT_SYMBOL(pcie_capability_write_dword); int pcie_capability_clear_and_set_word_unlocked(struct pci_dev *dev, int pos, u16 clear, u16 set) { int ret; u16 val; ret = pcie_capability_read_word(dev, pos, &val); if (ret) return ret; val &= ~clear; val |= set; return pcie_capability_write_word(dev, pos, val); } EXPORT_SYMBOL(pcie_capability_clear_and_set_word_unlocked); int pcie_capability_clear_and_set_word_locked(struct pci_dev *dev, int pos, u16 clear, u16 set) { unsigned long flags; int ret; spin_lock_irqsave(&dev->pcie_cap_lock, flags); ret = pcie_capability_clear_and_set_word_unlocked(dev, pos, clear, set); spin_unlock_irqrestore(&dev->pcie_cap_lock, flags); return ret; } EXPORT_SYMBOL(pcie_capability_clear_and_set_word_locked); int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos, u32 clear, u32 set) { int ret; u32 val; ret = pcie_capability_read_dword(dev, pos, &val); if (ret) return ret; val &= ~clear; val |= set; return pcie_capability_write_dword(dev, pos, val); } EXPORT_SYMBOL(pcie_capability_clear_and_set_dword); int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val) { if (pci_dev_is_disconnected(dev)) { PCI_SET_ERROR_RESPONSE(val); return PCIBIOS_DEVICE_NOT_FOUND; } return pci_bus_read_config_byte(dev->bus, dev->devfn, where, val); } EXPORT_SYMBOL(pci_read_config_byte); int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val) { if (pci_dev_is_disconnected(dev)) { PCI_SET_ERROR_RESPONSE(val); return PCIBIOS_DEVICE_NOT_FOUND; } return pci_bus_read_config_word(dev->bus, dev->devfn, where, val); } EXPORT_SYMBOL(pci_read_config_word); int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val) { if (pci_dev_is_disconnected(dev)) { PCI_SET_ERROR_RESPONSE(val); return PCIBIOS_DEVICE_NOT_FOUND; } return pci_bus_read_config_dword(dev->bus, dev->devfn, where, val); } EXPORT_SYMBOL(pci_read_config_dword); int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val) { if (pci_dev_is_disconnected(dev)) return PCIBIOS_DEVICE_NOT_FOUND; return pci_bus_write_config_byte(dev->bus, dev->devfn, where, val); } EXPORT_SYMBOL(pci_write_config_byte); int pci_write_config_word(const struct pci_dev *dev, int where, u16 val) { if (pci_dev_is_disconnected(dev)) return PCIBIOS_DEVICE_NOT_FOUND; return pci_bus_write_config_word(dev->bus, dev->devfn, where, val); } EXPORT_SYMBOL(pci_write_config_word); int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val) { if (pci_dev_is_disconnected(dev)) return PCIBIOS_DEVICE_NOT_FOUND; return pci_bus_write_config_dword(dev->bus, dev->devfn, where, val); } EXPORT_SYMBOL(pci_write_config_dword); void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos, u32 clear, u32 set) { u32 val; pci_read_config_dword(dev, pos, &val); val &= ~clear; val |= set; pci_write_config_dword(dev, pos, val); } EXPORT_SYMBOL(pci_clear_and_set_config_dword);
399 13 406 34 6 294 295 294 378 1 488 490 489 10 504 2 490 490 490 6 2 489 490 490 486 12 490 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_IO_H #define _BCACHEFS_BTREE_IO_H #include "bkey_methods.h" #include "bset.h" #include "btree_locking.h" #include "checksum.h" #include "extents.h" #include "io_write_types.h" struct bch_fs; struct btree_write; struct btree; struct btree_iter; struct btree_node_read_all; static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) atomic_long_inc(&c->btree_cache.nr_dirty); } static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) atomic_long_dec(&c->btree_cache.nr_dirty); } static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) { return k.k->type == KEY_TYPE_btree_ptr_v2 ? le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors_written) : 0; } struct btree_read_bio { struct bch_fs *c; struct btree *b; struct btree_node_read_all *ra; u64 start_time; unsigned have_ioref:1; unsigned idx:7; struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; }; struct btree_write_bio { struct work_struct work; __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); void *data; unsigned data_bytes; unsigned sector_offset; struct bch_write_bio wbio; }; void bch2_btree_node_io_unlock(struct btree *); void bch2_btree_node_io_lock(struct btree *); void __bch2_btree_node_wait_on_read(struct btree *); void __bch2_btree_node_wait_on_write(struct btree *); void bch2_btree_node_wait_on_read(struct btree *); void bch2_btree_node_wait_on_write(struct btree *); enum compact_mode { COMPACT_LAZY, COMPACT_ALL, }; bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode); static inline bool should_compact_bset_lazy(struct btree *b, struct bset_tree *t) { unsigned total_u64s = bset_u64s(t); unsigned dead_u64s = bset_dead_u64s(b, t); return dead_u64s > 64 && dead_u64s * 3 > total_u64s; } static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) { for_each_bset(b, t) if (should_compact_bset_lazy(b, t)) return bch2_compact_whiteouts(c, b, COMPACT_LAZY); return false; } static inline struct nonce btree_nonce(struct bset *i, unsigned offset) { return (struct nonce) {{ [0] = cpu_to_le32(offset), [1] = ((__le32 *) &i->seq)[0], [2] = ((__le32 *) &i->seq)[1], [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, }}; } static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) { struct nonce nonce = btree_nonce(i, offset); int ret; if (!offset) { struct btree_node *bn = container_of(i, struct btree_node, keys); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, bytes); if (ret) return ret; nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); } return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, vstruct_end(i) - (void *) i->_data); } void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); void bch2_btree_node_drop_keys_outside_node(struct btree *); void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool, bool *); void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, __BTREE_WRITE_ALREADY_STARTED, }; #define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) #define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, enum six_lock_type, unsigned); void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, enum six_lock_type, unsigned); static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, enum six_lock_type lock_held) { bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } bool bch2_btree_flush_all_reads(struct bch_fs *); bool bch2_btree_flush_all_writes(struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, int write, struct bkey_format *f) { if (version < bcachefs_metadata_version_inode_btree_change && btree_id == BTREE_ID_inodes) { swap(f->bits_per_field[BKEY_FIELD_INODE], f->bits_per_field[BKEY_FIELD_OFFSET]); swap(f->field_offset[BKEY_FIELD_INODE], f->field_offset[BKEY_FIELD_OFFSET]); } if (version < bcachefs_metadata_version_snapshot && (level || btree_type_has_snapshots(btree_id))) { u64 max_packed = ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); f->field_offset[BKEY_FIELD_SNAPSHOT] = write ? 0 : cpu_to_le64(U32_MAX - max_packed); } } static inline void compat_bpos(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, int write, struct bpos *p) { if (big_endian != CPU_BIG_ENDIAN) bch2_bpos_swab(p); if (version < bcachefs_metadata_version_inode_btree_change && btree_id == BTREE_ID_inodes) swap(p->inode, p->offset); } static inline void compat_btree_node(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, int write, struct btree_node *bn) { if (version < bcachefs_metadata_version_inode_btree_change && btree_id_is_extents(btree_id) && !bpos_eq(bn->min_key, POS_MIN) && write) bn->min_key = bpos_nosnap_predecessor(bn->min_key); if (version < bcachefs_metadata_version_snapshot && write) bn->max_key.snapshot = 0; compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); if (version < bcachefs_metadata_version_snapshot && !write) bn->max_key.snapshot = U32_MAX; if (version < bcachefs_metadata_version_inode_btree_change && btree_id_is_extents(btree_id) && !bpos_eq(bn->min_key, POS_MIN) && !write) bn->min_key = bpos_nosnap_successor(bn->min_key); } void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); #endif /* _BCACHEFS_BTREE_IO_H */
93 13 12 80 4 78 78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2021 Oracle Corporation */ #include <linux/slab.h> #include <linux/completion.h> #include <linux/sched/task.h> #include <linux/sched/vhost_task.h> #include <linux/sched/signal.h> enum vhost_task_flags { VHOST_TASK_FLAGS_STOP, VHOST_TASK_FLAGS_KILLED, }; struct vhost_task { bool (*fn)(void *data); void (*handle_sigkill)(void *data); void *data; struct completion exited; unsigned long flags; struct task_struct *task; /* serialize SIGKILL and vhost_task_stop calls */ struct mutex exit_mutex; }; static int vhost_task_fn(void *data) { struct vhost_task *vtsk = data; for (;;) { bool did_work; if (signal_pending(current)) { struct ksignal ksig; if (get_signal(&ksig)) break; } /* mb paired w/ vhost_task_stop */ set_current_state(TASK_INTERRUPTIBLE); if (test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) { __set_current_state(TASK_RUNNING); break; } did_work = vtsk->fn(vtsk->data); if (!did_work) schedule(); } mutex_lock(&vtsk->exit_mutex); /* * If a vhost_task_stop and SIGKILL race, we can ignore the SIGKILL. * When the vhost layer has called vhost_task_stop it's already stopped * new work and flushed. */ if (!test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) { set_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags); vtsk->handle_sigkill(vtsk->data); } mutex_unlock(&vtsk->exit_mutex); complete(&vtsk->exited); do_exit(0); } /** * vhost_task_wake - wakeup the vhost_task * @vtsk: vhost_task to wake * * wake up the vhost_task worker thread */ void vhost_task_wake(struct vhost_task *vtsk) { wake_up_process(vtsk->task); } EXPORT_SYMBOL_GPL(vhost_task_wake); /** * vhost_task_stop - stop a vhost_task * @vtsk: vhost_task to stop * * vhost_task_fn ensures the worker thread exits after * VHOST_TASK_FLAGS_STOP becomes true. */ void vhost_task_stop(struct vhost_task *vtsk) { mutex_lock(&vtsk->exit_mutex); if (!test_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags)) { set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags); vhost_task_wake(vtsk); } mutex_unlock(&vtsk->exit_mutex); /* * Make sure vhost_task_fn is no longer accessing the vhost_task before * freeing it below. */ wait_for_completion(&vtsk->exited); kfree(vtsk); } EXPORT_SYMBOL_GPL(vhost_task_stop); /** * vhost_task_create - create a copy of a task to be used by the kernel * @fn: vhost worker function * @handle_sigkill: vhost function to handle when we are killed * @arg: data to be passed to fn and handled_kill * @name: the thread's name * * This returns a specialized task for use by the vhost layer or NULL on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ struct vhost_task *vhost_task_create(bool (*fn)(void *), void (*handle_sigkill)(void *), void *arg, const char *name) { struct kernel_clone_args args = { .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM | CLONE_THREAD | CLONE_SIGHAND, .exit_signal = 0, .fn = vhost_task_fn, .name = name, .user_worker = 1, .no_files = 1, }; struct vhost_task *vtsk; struct task_struct *tsk; vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); if (!vtsk) return NULL; init_completion(&vtsk->exited); mutex_init(&vtsk->exit_mutex); vtsk->data = arg; vtsk->fn = fn; vtsk->handle_sigkill = handle_sigkill; args.fn_arg = vtsk; tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); return NULL; } vtsk->task = tsk; return vtsk; } EXPORT_SYMBOL_GPL(vhost_task_create); /** * vhost_task_start - start a vhost_task created with vhost_task_create * @vtsk: vhost_task to wake up */ void vhost_task_start(struct vhost_task *vtsk) { wake_up_new_task(vtsk->task); } EXPORT_SYMBOL_GPL(vhost_task_start);
20 79 20 3 70 95 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 /* SPDX-License-Identifier: GPL-2.0-only */ /* * ocfs2_fs.h * * On-disk structures for OCFS2. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H #include <linux/magic.h> /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 /* * An OCFS2 volume starts this way: * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. * * All other structures are found from the superblock information. * * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a * blocksize of 2K, it is 4096 bytes into disk. */ #define OCFS2_SUPER_BLOCK_BLKNO 2 /* * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could * grow if needed. */ #define OCFS2_MIN_CLUSTERSIZE 4096 #define OCFS2_MAX_CLUSTERSIZE 1048576 /* * Blocks cannot be bigger than clusters, so the maximum blocksize is the * minimum cluster size. */ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" #define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" #define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" #define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" #define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_compat & (mask) ) #define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) #define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) #define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_compat |= (mask) #define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_ro_compat |= (mask) #define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_incompat |= (mask) #define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_compat &= ~(mask) #define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) #define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ OCFS2_SB(sb)->s_feature_incompat &= ~(mask) #define OCFS2_FEATURE_COMPAT_SUPP (OCFS2_FEATURE_COMPAT_BACKUP_SB \ | OCFS2_FEATURE_COMPAT_JBD2_SB) #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ | OCFS2_FEATURE_INCOMPAT_XATTR \ | OCFS2_FEATURE_INCOMPAT_META_ECC \ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The * filesystem driver can't load them, but the library can. Never put * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. */ #define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 /* * tunefs sets this incompat flag before starting the resize and clears it * at the end. This flag protects users from inadvertently mounting the fs * after an aborted run without fsck-ing. */ #define OCFS2_FEATURE_INCOMPAT_RESIZE_INPROG 0x0004 /* Used to denote a non-clustered volume */ #define OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT 0x0008 /* Support for sparse allocation in b-trees */ #define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC 0x0010 /* * Tunefs sets this incompat flag before starting an operation which * would require cleanup on abort. This is done to protect users from * inadvertently mounting the fs after an aborted run without * fsck-ing. * * s_tunefs_flags on the super block describes precisely which * operations were in progress. */ #define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG 0x0020 /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 /* * Support for alternate, userspace cluster stacks. If set, the superblock * field s_cluster_info contains a tag for the alternate stack in use as * well as the name of the cluster being joined. * mount.ocfs2 must pass in a matching stack name. * * If not set, the classic stack will be used. This is compatible with * all older versions. */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 /* Support for the extended slot map */ #define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 /* Support for indexed directories */ #define OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS 0x0400 /* Metadata checksum and error correction */ #define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 /* Refcount tree support */ #define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000 /* Discontiguous block groups */ #define OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG 0x2000 /* * Incompat bit to indicate usable clusterinfo with stackflags for all * cluster stacks (userspace adnd o2cb). If this bit is set, * INCOMPAT_USERSPACE_STACK becomes superfluous and thus should not be set. */ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 /* * Append Direct IO support */ #define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 /* * backup superblock flag is used to indicate that this volume * has backup superblocks. */ #define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001 /* * The filesystem will correctly handle journal feature bits. */ #define OCFS2_FEATURE_COMPAT_JBD2_SB 0x0002 /* * Unwritten extents support. */ #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 /* * Maintain quota information for this filesystem */ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ #define OCFS2_BACKUP_SB_START 1 << 30 /* the max backup superblock nums */ #define OCFS2_MAX_BACKUP_SUPERBLOCKS 6 /* * Flags on ocfs2_super_block.s_tunefs_flags */ #define OCFS2_TUNEFS_INPROG_REMOVE_SLOT 0x0001 /* Removing slots */ /* * Flags on ocfs2_dinode.i_flags */ #define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ #define OCFS2_UNUSED2_FL (0x00000002) #define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ #define OCFS2_UNUSED3_FL (0x00000008) /* System inode flags */ #define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ #define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ #define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ #define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ #define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ #define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features * * These can change much more often than i_flags. When adding flags, * keep in mind that i_dyn_features is only 16 bits wide. */ #define OCFS2_INLINE_DATA_FL (0x0001) /* Data stored in inode block */ #define OCFS2_HAS_XATTR_FL (0x0002) #define OCFS2_INLINE_XATTR_FL (0x0004) #define OCFS2_INDEXED_DIR_FL (0x0008) #define OCFS2_HAS_REFCOUNT_FL (0x0010) /* Inode attributes, keep in sync with EXT2 */ #define OCFS2_SECRM_FL FS_SECRM_FL /* Secure deletion */ #define OCFS2_UNRM_FL FS_UNRM_FL /* Undelete */ #define OCFS2_COMPR_FL FS_COMPR_FL /* Compress file */ #define OCFS2_SYNC_FL FS_SYNC_FL /* Synchronous updates */ #define OCFS2_IMMUTABLE_FL FS_IMMUTABLE_FL /* Immutable file */ #define OCFS2_APPEND_FL FS_APPEND_FL /* writes to file may only append */ #define OCFS2_NODUMP_FL FS_NODUMP_FL /* do not dump file */ #define OCFS2_NOATIME_FL FS_NOATIME_FL /* do not update atime */ /* Reserved for compression usage... */ #define OCFS2_DIRTY_FL FS_DIRTY_FL #define OCFS2_COMPRBLK_FL FS_COMPRBLK_FL /* One or more compressed clusters */ #define OCFS2_NOCOMP_FL FS_NOCOMP_FL /* Don't compress */ #define OCFS2_ECOMPR_FL FS_ECOMPR_FL /* Compression error */ /* End compression flags --- maybe not all used */ #define OCFS2_BTREE_FL FS_BTREE_FL /* btree format dir */ #define OCFS2_INDEX_FL FS_INDEX_FL /* hash-indexed directory */ #define OCFS2_IMAGIC_FL FS_IMAGIC_FL /* AFS directory */ #define OCFS2_JOURNAL_DATA_FL FS_JOURNAL_DATA_FL /* Reserved for ext3 */ #define OCFS2_NOTAIL_FL FS_NOTAIL_FL /* file tail should not be merged */ #define OCFS2_DIRSYNC_FL FS_DIRSYNC_FL /* dirsync behaviour (directories only) */ #define OCFS2_TOPDIR_FL FS_TOPDIR_FL /* Top of directory hierarchies*/ #define OCFS2_RESERVED_FL FS_RESERVED_FL /* reserved for ext2 lib */ #define OCFS2_FL_VISIBLE FS_FL_USER_VISIBLE /* User visible flags */ #define OCFS2_FL_MODIFIABLE FS_FL_USER_MODIFIABLE /* User modifiable flags */ /* * Extent record flags (e_node.leaf.flags) */ #define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but * unwritten */ #define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference * counted in an associated * refcount tree */ /* * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) */ #define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ /* * superblock s_state flags */ #define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ /* Limit of space in ocfs2_dir_entry */ #define OCFS2_MAX_FILENAME_LEN 255 /* Maximum slots on an ocfs2 file system */ #define OCFS2_MAX_SLOTS 255 /* Slot map indicator for an empty slot */ #define OCFS2_INVALID_SLOT ((u16)-1) #define OCFS2_VOL_UUID_LEN 16 #define OCFS2_MAX_VOL_LABEL_LEN 64 /* The cluster stack fields */ #define OCFS2_STACK_LABEL_LEN 4 #define OCFS2_CLUSTER_NAME_LEN 16 /* Classic (historically speaking) cluster stack */ #define OCFS2_CLASSIC_CLUSTER_STACK "o2cb" /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) /* * Inline extended attribute size (in bytes) * The value chosen should be aligned to 16 byte boundaries. */ #define OCFS2_MIN_XATTR_INLINE_SIZE 256 /* * Cluster info flags (ocfs2_cluster_info.ci_stackflags) */ #define OCFS2_CLUSTER_O2CB_GLOBAL_HEARTBEAT (0x01) struct ocfs2_system_inode_info { char *si_name; int si_iflags; int si_mode; }; /* System file index */ enum { BAD_BLOCK_SYSTEM_INODE = 0, GLOBAL_INODE_ALLOC_SYSTEM_INODE, #define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE, HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, USER_QUOTA_SYSTEM_INODE, GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE #define OCFS2_FIRST_LOCAL_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE, EXTENT_ALLOC_SYSTEM_INODE, INODE_ALLOC_SYSTEM_INODE, JOURNAL_SYSTEM_INODE, LOCAL_ALLOC_SYSTEM_INODE, TRUNCATE_LOG_SYSTEM_INODE, LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE, #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; #define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE #define NUM_LOCAL_SYSTEM_INODES \ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { /* Global system inodes (single copy) */ /* The first two are only used from userspace mfks/tunefs */ [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, /* These are used by the running filesystem */ [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, /* Slot-specific system inodes (one copy per slot) */ [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, }; /* Parameter passed from mount.ocfs2 to module */ #define OCFS2_HB_NONE "heartbeat=none" #define OCFS2_HB_LOCAL "heartbeat=local" #define OCFS2_HB_GLOBAL "heartbeat=global" /* * OCFS2_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define OCFS2_DIR_PAD 4 #define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) #define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) #define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ OCFS2_DIR_ROUND) & \ ~OCFS2_DIR_ROUND) #define OCFS2_DIR_MIN_REC_LEN OCFS2_DIR_REC_LEN(1) #define OCFS2_LINK_MAX 32000 #define OCFS2_DX_LINK_MAX ((1U << 31) - 1U) #define OCFS2_LINKS_HI_SHIFT 16 #define OCFS2_DX_ENTRIES_MAX (0xffffffffU) /* * Convenience casts */ #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) /* * Block checking structure. This is used in metadata to validate the * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all * zeros. */ struct ocfs2_block_check { /*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ __le16 bc_ecc; /* Single-error-correction parity vector. This is a simple Hamming code dependent on the blocksize. OCFS2's maximum blocksize, 4K, requires 16 parity bits, so we fit in __le16. */ __le16 bc_reserved1; /*08*/ }; /* * On disk extent record for OCFS2 * It describes a range of clusters on disk. * * Length fields are divided into interior and leaf node versions. * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. */ struct ocfs2_extent_rec { /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ union { __le32 e_int_clusters; /* Clusters covered by all children */ struct { __le16 e_leaf_clusters; /* Clusters covered by this extent */ __u8 e_reserved1; __u8 e_flags; /* Extent flags */ }; }; __le64 e_blkno; /* Physical disk offset, in blocks */ /*10*/ }; struct ocfs2_chain_rec { __le32 c_free; /* Number of free bits in this chain. */ __le32 c_total; /* Number of total bits in this chain */ __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ }; struct ocfs2_truncate_rec { __le32 t_start; /* 1st cluster in this log */ __le32 t_clusters; /* Number of total clusters covered */ }; /* * On disk extent list for OCFS2 (node in the tree). Note that this * is contained inside ocfs2_dinode or ocfs2_extent_block, so the * offsets are relative to ocfs2_dinode.id2.i_list or * ocfs2_extent_block.h_list, respectively. */ struct ocfs2_extent_list { /*00*/ __le16 l_tree_depth; /* Extent tree depth from this point. 0 means data extents hang directly off this header (a leaf) NOTE: The high 8 bits cannot be used - tree_depth is never that big. */ __le16 l_count; /* Number of extent records */ __le16 l_next_free_rec; /* Next unused extent slot */ __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ /*10*/ struct ocfs2_extent_rec l_recs[]; /* Extent records */ }; /* * On disk allocation chain list for OCFS2. Note that this is * contained inside ocfs2_dinode, so the offsets are relative to * ocfs2_dinode.id2.i_chain. */ struct ocfs2_chain_list { /*00*/ __le16 cl_cpg; /* Clusters per Block Group */ __le16 cl_bpc; /* Bits per cluster */ __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; /*10*/ struct ocfs2_chain_rec cl_recs[]; /* Chain records */ }; /* * On disk deallocation log for OCFS2. Note that this is * contained inside ocfs2_dinode, so the offsets are relative to * ocfs2_dinode.id2.i_dealloc. */ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; /*08*/ struct ocfs2_truncate_rec tl_recs[]; /* Truncate records */ }; /* * On disk extent block (indirect block) for OCFS2 */ struct ocfs2_extent_block { /*00*/ __u8 h_signature[8]; /* Signature for verification */ struct ocfs2_block_check h_check; /* Error checking */ /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this extent_header belongs to */ __le16 h_suballoc_bit; /* Bit offset in suballocator block group */ __le32 h_fs_generation; /* Must match super block */ __le64 h_blkno; /* Offset on disk, in blocks */ /*20*/ __le64 h_suballoc_loc; /* Suballocator block group this eb belongs to. Only valid if allocated from a discontiguous block group */ __le64 h_next_leaf_blk; /* Offset on disk, in blocks, of next leaf header pointing to data */ /*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ /* Actual on-disk size is one block */ }; /* * On disk slot map for OCFS2. This defines the contents of the "slot_map" * system file. A slot is valid if it contains a node number >= 0. The * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. */ struct ocfs2_slot_map { /*00*/ DECLARE_FLEX_ARRAY(__le16, sm_slots); /* * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize. */ }; struct ocfs2_extended_slot { /*00*/ __u8 es_valid; __u8 es_reserved1[3]; __le32 es_node_num; /*08*/ }; /* * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP * is set. It separates out the valid marker from the node number, and * has room to grow. Unlike the old slot map, this format is defined by * i_size. */ struct ocfs2_slot_map_extended { /*00*/ DECLARE_FLEX_ARRAY(struct ocfs2_extended_slot, se_slots); /* * Actual size is i_size of the slot_map system file. It should * match s_max_slots * sizeof(struct ocfs2_extended_slot) */ }; /* * ci_stackflags is only valid if the incompat bit * OCFS2_FEATURE_INCOMPAT_CLUSTERINFO is set. */ struct ocfs2_cluster_info { /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; union { __le32 ci_reserved; struct { __u8 ci_stackflags; __u8 ci_reserved1; __u8 ci_reserved2; __u8 ci_reserved3; }; }; /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; /*18*/ }; /* * On disk superblock for OCFS2 * Note that it is contained inside an ocfs2_dinode, so all offsets * are relative to the start of ocfs2_dinode.id2. */ struct ocfs2_super_block { /*00*/ __le16 s_major_rev_level; __le16 s_minor_rev_level; __le16 s_mnt_count; __le16 s_max_mnt_count; __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le32 s_checkinterval; /* Max time between checks */ /*10*/ __le64 s_lastcheck; /* Time of last check */ __le32 s_creator_os; /* OS */ __le32 s_feature_compat; /* Compatible feature set */ /*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ __le64 s_root_blkno; /* Offset, in blocks, of root directory dinode */ /*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system directory dinode */ __le32 s_blocksize_bits; /* Blocksize for this fs */ __le32 s_clustersize_bits; /* Clustersize for this fs */ /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts before tunefs required */ __le16 s_tunefs_flag; __le32 s_uuid_hash; /* hash value of uuid */ __le64 s_first_cluster_group; /* Block offset of 1st cluster * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Only valid if either userspace or clusterinfo INCOMPAT flag set. */ /*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size for this fs*/ __le16 s_reserved0; __le32 s_dx_seed[3]; /* seed[0-2] for dx dir hash. * s_uuid_hash serves as seed[3]. */ /*C0*/ __le64 s_reserved2[15]; /* Fill out superblock */ /*140*/ /* * NOTE: As stated above, all offsets are relative to * ocfs2_dinode.id2, which is at 0xC0 in the inode. * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within * our smallest blocksize, which is 512 bytes. To ensure this, * we reserve the space in s_reserved2. Anything past s_reserved2 * will not be available on the smallest blocksize. */ }; /* * Local allocation bitmap for OCFS2 slots * Note that it exists inside an ocfs2_dinode, so all offsets are * relative to the start of ocfs2_dinode.id2. */ struct ocfs2_local_alloc { /*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ __le16 la_size; /* Size of included bitmap, in bytes */ __le16 la_reserved1; __le64 la_reserved2; /*10*/ __u8 la_bitmap[]; }; /* * Data-in-inode header. This is only used if i_dyn_features has * OCFS2_INLINE_DATA_FL set. */ struct ocfs2_inline_data { /*00*/ __le16 id_count; /* Number of bytes that can be used * for data, starting at id_data */ __le16 id_reserved0; __le32 id_reserved1; __u8 id_data[]; /* Start of user data */ }; /* * On disk inode for OCFS2 */ struct ocfs2_dinode { /*00*/ __u8 i_signature[8]; /* Signature for validation */ __le32 i_generation; /* Generation number */ __le16 i_suballoc_slot; /* Slot suballocator this inode belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ /*10*/ __le16 i_links_count_hi; /* High 16 bits of links count */ __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ __le32 i_gid; /* Owning GID */ /*20*/ __le64 i_size; /* Size in bytes */ __le16 i_mode; /* File mode */ __le16 i_links_count; /* Links count */ __le32 i_flags; /* File flags */ /*30*/ __le64 i_atime; /* Access time */ __le64 i_ctime; /* Creation time */ /*40*/ __le64 i_mtime; /* Modification time */ __le64 i_dtime; /* Deletion time */ /*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ __le64 i_last_eb_blk; /* Pointer to last extent block */ /*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ __le32 i_atime_nsec; __le32 i_ctime_nsec; __le32 i_mtime_nsec; /*70*/ __le32 i_attr; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL was set in i_flags */ __le16 i_dyn_features; __le64 i_xattr_loc; /*80*/ struct ocfs2_block_check i_check; /* Error checking */ /*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ /*90*/ __le64 i_refcount_loc; __le64 i_suballoc_loc; /* Suballocator block group this inode belongs to. Only valid if allocated from a discontiguous block group */ /*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ __le16 i_reserved1[3]; __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ struct { __le64 i_rdev; /* Device number */ } dev1; struct { /* Info for bitmap system inodes */ __le32 i_used; /* Bits (ie, clusters) used */ __le32 i_total; /* Total bits (clusters) available */ } bitmap1; struct { /* Info for journal system inodes */ __le32 ij_flags; /* Mounted, version, etc. */ __le32 ij_recovery_generation; /* Incremented when the journal is recovered after an unclean shutdown */ } journal1; } id1; /* Inode type dependent 1 */ /*C0*/ union { struct ocfs2_super_block i_super; struct ocfs2_local_alloc i_lab; struct ocfs2_chain_list i_chain; struct ocfs2_extent_list i_list; struct ocfs2_truncate_log i_dealloc; struct ocfs2_inline_data i_data; DECLARE_FLEX_ARRAY(__u8, i_symlink); } id2; /* Actual on-disk size is one block */ }; /* * On-disk directory entry structure for OCFS2 * * Packed as this structure could be accessed unaligned on 64-bit platforms */ struct ocfs2_dir_entry { /*00*/ __le64 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ /* Actual on-disk length specified by rec_len */ } __attribute__ ((packed)); /* * Per-block record for the unindexed directory btree. This is carefully * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are * mirrored. That way, the directory manipulation code needs a minimal amount * of update. * * NOTE: Keep this structure aligned to a multiple of 4 bytes. */ struct ocfs2_dir_block_trailer { /*00*/ __le64 db_compat_inode; /* Always zero. Was inode */ __le16 db_compat_rec_len; /* Backwards compatible with * ocfs2_dir_entry. */ __u8 db_compat_name_len; /* Always zero. Was name_len */ __u8 db_reserved0; __le16 db_reserved1; __le16 db_free_rec_len; /* Size of largest empty hole * in this block. (unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; /*20*/ __le64 db_free_next; /* Next block in list (unused) */ __le64 db_blkno; /* Offset on disk, in blocks */ /*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; /* * A directory entry in the indexed tree. We don't store the full name here, * but instead provide a pointer to the full dirent in the unindexed tree. * * We also store name_len here so as to reduce the number of leaf blocks we * need to search in case of collisions. */ struct ocfs2_dx_entry { __le32 dx_major_hash; /* Used to find logical * cluster in index */ __le32 dx_minor_hash; /* Lower bits used to find * block in cluster */ __le64 dx_dirent_blk; /* Physical block in unindexed * tree holding this dirent. */ }; struct ocfs2_dx_entry_list { __le32 de_reserved; __le16 de_count; /* Maximum number of entries * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ struct ocfs2_dx_entry de_entries[]; /* Indexed dir entries * in a packed array of * length de_num_used */ }; #define OCFS2_DX_FLAG_INLINE 0x01 /* * A directory indexing block. Each indexed directory has one of these, * pointed to by ocfs2_dinode. * * This block stores an indexed btree root, and a set of free space * start-of-list pointers. */ struct ocfs2_dx_root_block { __u8 dr_signature[8]; /* Signature for verification */ struct ocfs2_block_check dr_check; /* Error checking */ __le16 dr_suballoc_slot; /* Slot suballocator this * block belongs to. */ __le16 dr_suballoc_bit; /* Bit offset in suballocator * block group */ __le32 dr_fs_generation; /* Must match super block */ __le64 dr_blkno; /* Offset on disk, in blocks */ __le64 dr_last_eb_blk; /* Pointer to last * extent block */ __le32 dr_clusters; /* Clusters allocated * to the indexed tree. */ __u8 dr_flags; /* OCFS2_DX_FLAG_* flags */ __u8 dr_reserved0; __le16 dr_reserved1; __le64 dr_dir_blkno; /* Pointer to parent inode */ __le32 dr_num_entries; /* Total number of * names stored in * this directory.*/ __le32 dr_reserved2; __le64 dr_free_blk; /* Pointer to head of free * unindexed block list. */ __le64 dr_suballoc_loc; /* Suballocator block group this root belongs to. Only valid if allocated from a discontiguous block group */ __le64 dr_reserved3[14]; union { struct ocfs2_extent_list dr_list; /* Keep this aligned to 128 * bits for maximum space * efficiency. */ struct ocfs2_dx_entry_list dr_entries; /* In-root-block list of * entries. We grow out * to extents if this * gets too big. */ }; }; /* * The header of a leaf block in the indexed tree. */ struct ocfs2_dx_leaf { __u8 dl_signature[8];/* Signature for verification */ struct ocfs2_block_check dl_check; /* Error checking */ __le64 dl_blkno; /* Offset on disk, in blocks */ __le32 dl_fs_generation;/* Must match super block */ __le32 dl_reserved0; __le64 dl_reserved1; struct ocfs2_dx_entry_list dl_list; }; /* * Largest bitmap for a block (suballocator) group in bytes. This limit * does not affect cluster groups (global allocator). Cluster group * bitmaps run to the end of the block. */ #define OCFS2_MAX_BG_BITMAP_SIZE 256 /* * On disk allocator group structure for OCFS2 */ struct ocfs2_group_desc { /*00*/ __u8 bg_signature[8]; /* Signature for validation */ __le16 bg_size; /* Size of included bitmap in bytes. */ __le16 bg_bits; /* Bits represented by this group. */ __le16 bg_free_bits_count; /* Free bits count */ __le16 bg_chain; /* What chain I am in. */ /*10*/ __le32 bg_generation; __le16 bg_contig_free_bits; /* max contig free bits length */ __le16 bg_reserved1; __le64 bg_next_group; /* Next group in my list, in blocks */ /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in blocks */ __le64 bg_blkno; /* Offset on disk, in blocks */ /*30*/ struct ocfs2_block_check bg_check; /* Error checking */ __le64 bg_reserved2; /*40*/ union { DECLARE_FLEX_ARRAY(__u8, bg_bitmap); struct { /* * Block groups may be discontiguous when * OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG is set. * The extents of a discontiguous block group are * stored in bg_list. It is a flat list. * l_tree_depth must always be zero. A * discontiguous group is signified by a non-zero * bg_list->l_next_free_rec. Only block groups * can be discontiguous; Cluster groups cannot. * We've never made a block group with more than * 2048 blocks (256 bytes of bg_bitmap). This * codifies that limit so that we can fit bg_list. * bg_size of a discontiguous block group will * be 256 to match bg_bitmap_filler. */ __u8 bg_bitmap_filler[OCFS2_MAX_BG_BITMAP_SIZE]; /*140*/ struct ocfs2_extent_list bg_list; }; }; /* Actual on-disk size is one block */ }; struct ocfs2_refcount_rec { /*00*/ __le64 r_cpos; /* Physical offset, in clusters */ __le32 r_clusters; /* Clusters covered by this extent */ __le32 r_refcount; /* Reference count of this extent */ /*10*/ }; #define OCFS2_32BIT_POS_MASK (0xffffffffULL) #define OCFS2_REFCOUNT_LEAF_FL (0x00000001) #define OCFS2_REFCOUNT_TREE_FL (0x00000002) struct ocfs2_refcount_list { /*00*/ __le16 rl_count; /* Maximum number of entries possible in rl_records */ __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ /*10*/ struct ocfs2_refcount_rec rl_recs[]; /* Refcount records */ }; struct ocfs2_refcount_block { /*00*/ __u8 rf_signature[8]; /* Signature for verification */ __le16 rf_suballoc_slot; /* Slot suballocator this block belongs to */ __le16 rf_suballoc_bit; /* Bit offset in suballocator block group */ __le32 rf_fs_generation; /* Must match superblock */ /*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */ __le64 rf_parent; /* Parent block, only valid if OCFS2_REFCOUNT_LEAF_FL is set in rf_flags */ /*20*/ struct ocfs2_block_check rf_check; /* Error checking */ __le64 rf_last_eb_blk; /* Pointer to last extent block */ /*30*/ __le32 rf_count; /* Number of inodes sharing this refcount tree */ __le32 rf_flags; /* See the flags above */ __le32 rf_clusters; /* clusters covered by refcount tree. */ __le32 rf_cpos; /* cluster offset in refcount tree.*/ /*40*/ __le32 rf_generation; /* generation number. all be the same * for the same refcount tree. */ __le32 rf_reserved0; __le64 rf_suballoc_loc; /* Suballocator block group this refcount block belongs to. Only valid if allocated from a discontiguous block group */ /*50*/ __le64 rf_reserved1[6]; /*80*/ union { struct ocfs2_refcount_list rf_records; /* List of refcount records */ struct ocfs2_extent_list rf_list; /* Extent record list, only valid if OCFS2_REFCOUNT_TREE_FL is set in rf_flags */ }; /* Actual on-disk size is one block */ }; /* * On disk extended attribute structure for OCFS2. */ /* * ocfs2_xattr_entry indicates one extend attribute. * * Note that it can be stored in inode, one block or one xattr bucket. */ struct ocfs2_xattr_entry { __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ __le16 xe_name_offset; /* byte offset from the 1st entry in the local xattr storage(inode, xattr block or xattr bucket). */ __u8 xe_name_len; /* xattr name len, doesn't include prefix. */ __u8 xe_type; /* the low 7 bits indicate the name prefix * type and the highest bit indicates whether * the EA is stored in the local storage. */ __le64 xe_value_size; /* real xattr value length. */ }; /* * On disk structure for xattr header. * * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in * the local xattr storage. */ struct ocfs2_xattr_header { __le16 xh_count; /* contains the count of how many records are in the local xattr storage. */ __le16 xh_free_start; /* current offset for storing xattr. */ __le16 xh_name_value_len; /* total length of name/value length in this bucket. */ __le16 xh_num_buckets; /* Number of xattr buckets in this extent record, only valid in the first bucket. */ struct ocfs2_block_check xh_check; /* Error checking (Note, this is only used for xattr buckets. A block uses xb_check and sets this field to zero.) */ struct ocfs2_xattr_entry xh_entries[]; /* xattr entry list. */ }; /* * On disk structure for xattr value root. * * When an xattr's value is large enough, it is stored in an external * b-tree like file data. The xattr value root points to this structure. */ struct ocfs2_xattr_value_root { /*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ __le32 xr_reserved0; __le64 xr_last_eb_blk; /* Pointer to last extent block */ /*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ }; /* * On disk structure for xattr tree root. * * It is used when there are too many extended attributes for one file. These * attributes will be organized and stored in an indexed-btree. */ struct ocfs2_xattr_tree_root { /*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ __le32 xt_reserved0; __le64 xt_last_eb_blk; /* Pointer to last extent block */ /*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ }; #define OCFS2_XATTR_INDEXED 0x1 #define OCFS2_HASH_SHIFT 5 #define OCFS2_XATTR_ROUND 3 #define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ ~(OCFS2_XATTR_ROUND)) #define OCFS2_XATTR_BUCKET_SIZE 4096 #define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ / OCFS2_MIN_BLOCKSIZE) /* * On disk structure for xattr block. */ struct ocfs2_xattr_block { /*00*/ __u8 xb_signature[8]; /* Signature for verification */ __le16 xb_suballoc_slot; /* Slot suballocator this block belongs to. */ __le16 xb_suballoc_bit; /* Bit offset in suballocator block group */ __le32 xb_fs_generation; /* Must match super block */ /*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ struct ocfs2_block_check xb_check; /* Error checking */ /*20*/ __le16 xb_flags; /* Indicates whether this block contains real xattr or a xattr tree. */ __le16 xb_reserved0; __le32 xb_reserved1; __le64 xb_suballoc_loc; /* Suballocator block group this xattr block belongs to. Only valid if allocated from a discontiguous block group */ /*30*/ union { struct ocfs2_xattr_header xb_header; /* xattr header if this block contains xattr */ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this block contains xattr tree. */ } xb_attrs; }; #define OCFS2_XATTR_ENTRY_LOCAL 0x80 #define OCFS2_XATTR_TYPE_MASK 0x7F static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, int local) { if (local) xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; else xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; } static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) { return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; } static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) { xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; } static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) { return xe->xe_type & OCFS2_XATTR_TYPE_MASK; } /* * On disk structures for global quota file */ /* Magic numbers and known versions for global quota files */ #define OCFS2_GLOBAL_QMAGICS {\ 0x0cf52470, /* USRQUOTA */ \ 0x0cf52471 /* GRPQUOTA */ \ } #define OCFS2_GLOBAL_QVERSIONS {\ 0, \ 0, \ } /* Each block of each quota file has a certain fixed number of bytes reserved * for OCFS2 internal use at its end. OCFS2 can use it for things like * checksums, etc. */ #define OCFS2_QBLK_RESERVED_SPACE 8 /* Generic header of all quota files */ struct ocfs2_disk_dqheader { __le32 dqh_magic; /* Magic number identifying file */ __le32 dqh_version; /* Quota format version */ }; #define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) /* Information header of global quota file (immediately follows the generic * header) */ struct ocfs2_global_disk_dqinfo { /*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ __le32 dqi_igrace; /* Grace time for inode softlimit excess */ __le32 dqi_syncms; /* Time after which we sync local changes to * global quota file */ __le32 dqi_blocks; /* Number of blocks in quota file */ /*10*/ __le32 dqi_free_blk; /* First free block in quota file */ __le32 dqi_free_entry; /* First block with free dquot entry in quota * file */ }; /* Structure with global user / group information. We reserve some space * for future use. */ struct ocfs2_global_disk_dqblk { /*00*/ __le32 dqb_id; /* ID the structure belongs to */ __le32 dqb_use_count; /* Number of nodes having reference to this structure */ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ /*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ __le64 dqb_curinodes; /* current # allocated inodes */ /*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ __le64 dqb_bsoftlimit; /* preferred limit on disk space */ /*30*/ __le64 dqb_curspace; /* current space occupied */ __le64 dqb_btime; /* time limit for excessive disk use */ /*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ __le64 dqb_pad1; /*50*/ __le64 dqb_pad2; }; /* * On-disk structures for local quota file */ /* Magic numbers and known versions for local quota files */ #define OCFS2_LOCAL_QMAGICS {\ 0x0cf524c0, /* USRQUOTA */ \ 0x0cf524c1 /* GRPQUOTA */ \ } #define OCFS2_LOCAL_QVERSIONS {\ 0, \ 0, \ } /* Quota flags in dqinfo header */ #define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ * quota has been cleanly turned off) */ #define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) /* Information header of local quota file (immediately follows the generic * header) */ struct ocfs2_local_disk_dqinfo { __le32 dqi_flags; /* Flags for quota file */ __le32 dqi_chunks; /* Number of chunks of quota structures * with a bitmap */ __le32 dqi_blocks; /* Number of blocks allocated for quota file */ }; /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; /* One entry in local quota file */ struct ocfs2_local_disk_dqblk { /*00*/ __le64 dqb_id; /* id this quota applies to */ __le64 dqb_spacemod; /* Change in the amount of used space */ /*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ }; /* * The quota trailer lives at the end of each quota block. */ struct ocfs2_disk_dqtrailer { /*00*/ struct ocfs2_block_check dq_check; /* Error checking */ /*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ }; static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, void *buf) { char *ptr = buf; ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; return (struct ocfs2_disk_dqtrailer *)ptr; } #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); } static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, struct ocfs2_dinode *di) { unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data) - xattrsize; else return sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); } static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_inode_with_xattr( struct super_block *sb, struct ocfs2_dinode *di) { int size; unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - xattrsize; else size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_dx_root(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_root_block, dr_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); return size / sizeof(struct ocfs2_chain_rec); } static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_extent_block, h_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_extent_recs_per_gd(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_dx_entries_per_leaf(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_leaf, dl_list.de_entries); return size / sizeof(struct ocfs2_dx_entry); } static inline int ocfs2_dx_entries_per_root(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dx_root_block, dr_entries.de_entries); return size / sizeof(struct ocfs2_dx_entry); } static inline u16 ocfs2_local_alloc_size(struct super_block *sb) { u16 size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); return size; } static inline int ocfs2_group_bitmap_size(struct super_block *sb, int suballocator, u32 feature_incompat) { int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); /* * The cluster allocator uses the entire block. Suballocators have * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older * code expects bg_size set to the maximum. Thus we must keep * bg_size as-is unless discontig_bg is enabled. */ if (suballocator && (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) size = OCFS2_MAX_BG_BITMAP_SIZE; return size; } static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); return size / sizeof(struct ocfs2_truncate_rec); } static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) { u64 offset = OCFS2_BACKUP_SB_START; if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { offset <<= (2 * index); offset >>= sb->s_blocksize_bits; return offset; } return 0; } static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_xattr_block, xb_attrs.xb_root.xt_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb) { int size; size = sb->s_blocksize - offsetof(struct ocfs2_refcount_block, rf_records.rl_recs); return size / sizeof(struct ocfs2_refcount_rec); } static inline u32 ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec) { return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; } #else static inline int ocfs2_fast_symlink_chars(int blocksize) { return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); } static inline int ocfs2_max_inline_data_with_xattr(int blocksize, struct ocfs2_dinode *di) { if (di && (di->i_dyn_features & OCFS2_INLINE_XATTR_FL)) return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data) - di->i_xattr_inline_size; else return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data); } static inline int ocfs2_extent_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_chain_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); return size / sizeof(struct ocfs2_chain_rec); } static inline int ocfs2_extent_recs_per_eb(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_extent_block, h_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_extent_recs_per_gd(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_group_desc, bg_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } static inline int ocfs2_local_alloc_size(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); return size; } static inline int ocfs2_group_bitmap_size(int blocksize, int suballocator, uint32_t feature_incompat) { int size = sb->s_blocksize - offsetof(struct ocfs2_group_desc, bg_bitmap); /* * The cluster allocator uses the entire block. Suballocators have * never used more than OCFS2_MAX_BG_BITMAP_SIZE. Unfortunately, older * code expects bg_size set to the maximum. Thus we must keep * bg_size as-is unless discontig_bg is enabled. */ if (suballocator && (feature_incompat & OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG)) size = OCFS2_MAX_BG_BITMAP_SIZE; return size; } static inline int ocfs2_truncate_recs_per_inode(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); return size / sizeof(struct ocfs2_truncate_rec); } static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) { uint64_t offset = OCFS2_BACKUP_SB_START; if (index >= 0 && index < OCFS2_MAX_BACKUP_SUPERBLOCKS) { offset <<= (2 * index); offset /= blocksize; return offset; } return 0; } static inline int ocfs2_xattr_recs_per_xb(int blocksize) { int size; size = blocksize - offsetof(struct ocfs2_xattr_block, xb_attrs.xb_root.xt_list.l_recs); return size / sizeof(struct ocfs2_extent_rec); } #endif /* __KERNEL__ */ static inline int ocfs2_system_inode_is_global(int type) { return ((type >= 0) && (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); } static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, int type, int slot) { int chars; /* * Global system inodes can only have one copy. Everything * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode * list has a copy per slot. */ if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) chars = snprintf(buf, len, "%s", ocfs2_system_inodes[type].si_name); else chars = snprintf(buf, len, ocfs2_system_inodes[type].si_name, slot); return chars; } static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, umode_t mode) { de->file_type = fs_umode_to_ftype(mode); } static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) { if ((offsetof(struct ocfs2_group_desc, bg_bitmap) + le16_to_cpu(gd->bg_size)) != offsetof(struct ocfs2_group_desc, bg_list)) return 0; /* * Only valid to check l_next_free_rec if * bg_bitmap + bg_size == bg_list. */ if (!gd->bg_list.l_next_free_rec) return 0; return 1; } #endif /* _OCFS2_FS_H */
40 39 1 40 39 40 40 40 194 151 3 40 159 165 8 8 8 8 3 2 2 1 1 9 9 4 1 3 3 3 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 // SPDX-License-Identifier: GPL-2.0-only /* * Pid namespaces * * Authors: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/pid.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/syscalls.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/acct.h> #include <linux/slab.h> #include <linux/proc_ns.h> #include <linux/reboot.h> #include <linux/export.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> #include <uapi/linux/wait.h> #include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; /* Write once array, filled from the beginning. */ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; /* * creates the kmem cache to allocate pids from. * @level: pid namespace level */ static struct kmem_cache *create_pid_cachep(unsigned int level) { /* Level 0 is init_pid_ns.pid_cachep */ struct kmem_cache **pkc = &pid_cache[level - 1]; struct kmem_cache *kc; char name[4 + 10 + 1]; unsigned int len; kc = READ_ONCE(*pkc); if (kc) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); len = struct_size_t(struct pid, numbers, level + 1); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) *pkc = kmem_cache_create(name, len, 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); mutex_unlock(&pid_caches_mutex); /* current can fail, but someone else can succeed. */ return READ_ONCE(*pkc); } static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); } static void dec_pid_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); } static void destroy_pid_namespace_work(struct work_struct *work); static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; struct ucounts *ucounts; int err; err = -EINVAL; if (!in_userns(parent_pid_ns->user_ns, user_ns)) goto out; err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; ucounts = inc_pid_namespaces(user_ns); if (!ucounts) goto out; err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out_dec; idr_init(&ns->idr); ns->pid_cachep = create_pid_cachep(level); if (ns->pid_cachep == NULL) goto out_free_idr; err = ns_alloc_inum(&ns->ns); if (err) goto out_free_idr; ns->ns.ops = &pidns_operations; ns->pid_max = parent_pid_ns->pid_max; err = register_pidns_sysctls(ns); if (err) goto out_free_inum; refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; INIT_WORK(&ns->work, destroy_pid_namespace_work); #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); #endif return ns; out_free_inum: ns_free_inum(&ns->ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); out_dec: dec_pid_namespaces(ucounts); out: return ERR_PTR(err); } static void delayed_free_pidns(struct rcu_head *p) { struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu); dec_pid_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kmem_cache_free(pid_ns_cachep, ns); } static void destroy_pid_namespace(struct pid_namespace *ns) { unregister_pidns_sysctls(ns); ns_free_inum(&ns->ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } static void destroy_pid_namespace_work(struct work_struct *work) { struct pid_namespace *ns = container_of(work, struct pid_namespace, work); do { struct pid_namespace *parent; parent = ns->parent; destroy_pid_namespace(ns); ns = parent; } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); } struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); } void put_pid_ns(struct pid_namespace *ns) { if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); void zap_pid_ns_processes(struct pid_namespace *pid_ns) { int nr; int rc; struct task_struct *task, *me = current; int init_pids = thread_group_leader(me) ? 1 : 2; struct pid *pid; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); /* * Ignore SIGCHLD causing any terminated children to autoreap. * This speeds up the namespace shutdown, plus see the comment * below. */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; spin_unlock_irq(&me->sighand->siglock); /* * The last thread in the cgroup-init thread group is terminating. * Find remaining pid_ts in the namespace, signal and wait for them * to exit. * * Note: This signals each threads in the namespace - even those that * belong to the same thread group, To avoid this, we would have * to walk the entire tasklist looking a processes in this * namespace, but that could be unnecessarily expensive if the * pid namespace has just a few processes. Or we need to * maintain a tasklist for each pid namespace. * */ rcu_read_lock(); read_lock(&tasklist_lock); nr = 2; idr_for_each_entry_continue(&pid_ns->idr, pid, nr) { task = pid_task(pid, PIDTYPE_PID); if (task && !__fatal_signal_pending(task)) group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); } read_unlock(&tasklist_lock); rcu_read_unlock(); /* * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. * kernel_wait4() will also block until our children traced from the * parent namespace are detached and become EXIT_DEAD. */ do { clear_thread_flag(TIF_SIGPENDING); clear_thread_flag(TIF_NOTIFY_SIGNAL); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE * process whose parents processes are outside of the pid * namespace. Such processes are created with setns()+fork(). * * If those EXIT_ZOMBIE processes are not reaped by their * parents before their parents exit, they will be reparented * to pid_ns->child_reaper. Thus pidns->child_reaper needs to * stay valid until they all go away. * * The code relies on the pid_ns->child_reaper ignoring * SIGCHILD to cause those EXIT_ZOMBIE processes to be * autoreaped if reparented. * * Semantically it is also desirable to wait for EXIT_ZOMBIE * processes before allowing the child_reaper to be reaped, as * that gives the invariant that when the init process of a * pid namespace is reaped all of the processes in the pid * namespace are gone. * * Once all of the other tasks are gone from the pid_namespace * free_pid() will awaken this task. */ for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->pid_allocated == init_pids) break; schedule(); } __set_current_state(TASK_RUNNING); if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; acct_exit_ns(pid_ns); return; } #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; int ret, next; if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; tmp.extra2 = &pid_ns->pid_max; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); return ret; } static const struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, .extra2 = &init_pid_ns.pid_max, }, }; #endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { if (pid_ns == &init_pid_ns) return 0; switch (cmd) { case LINUX_REBOOT_CMD_RESTART2: case LINUX_REBOOT_CMD_RESTART: pid_ns->reboot = SIGHUP; break; case LINUX_REBOOT_CMD_POWER_OFF: case LINUX_REBOOT_CMD_HALT: pid_ns->reboot = SIGINT; break; default: return -EINVAL; } read_lock(&tasklist_lock); send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); /* Not reached */ return 0; } static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) { return container_of(ns, struct pid_namespace, ns); } static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(task); if (ns) get_pid_ns(ns); rcu_read_unlock(); return ns ? &ns->ns : NULL; } static struct ns_common *pidns_for_children_get(struct task_struct *task) { struct pid_namespace *ns = NULL; task_lock(task); if (task->nsproxy) { ns = task->nsproxy->pid_ns_for_children; get_pid_ns(ns); } task_unlock(task); if (ns) { read_lock(&tasklist_lock); if (!ns->child_reaper) { put_pid_ns(ns); ns = NULL; } read_unlock(&tasklist_lock); } return ns ? &ns->ns : NULL; } static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); } static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* * Only allow entering the current active pid namespace * or a child of the current active pid namespace. * * This is required for fork to return a usable pid value and * this maintains the property that processes and their * children can not escape their current pid namespace. */ if (new->level < active->level) return -EINVAL; ancestor = new; while (ancestor->level > active->level) ancestor = ancestor->parent; if (ancestor != active) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); nsproxy->pid_ns_for_children = get_pid_ns(new); return 0; } static struct ns_common *pidns_get_parent(struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *pid_ns, *p; /* See if the parent is in the current namespace */ pid_ns = p = to_pid_ns(ns)->parent; for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == active) break; p = p->parent; } return &get_pid_ns(pid_ns)->ns; } static struct user_namespace *pidns_owner(struct ns_common *ns) { return to_pid_ns(ns)->user_ns; } const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, .owner = pidns_owner, .get_parent = pidns_get_parent, }; static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE register_sysctl_init("kernel", pid_ns_ctl_table); #endif register_pid_ns_sysctl_table_vm(); return 0; } __initcall(pid_namespaces_init);
44 39 14 51 39 34 101 46 44 26 32 144 145 145 8 39 39 46 39 30 87 199 143 148 2 5 5 1 1 5 5 46 39 46 7 1 7 208 10 6 145 195 7 50 2 49 3 48 49 42 48 48 3 33 35 1 2 33 3 3 35 35 35 20 1 20 1 22 21 1 5 17 24 2 5 101 101 6 6 5 6 5 7 7 6 6 1 5 9 9 5 3 3 1 2 10 1 9 10 3 7 6 48 47 4 45 4 1 10 10 2 36 2 34 33 3 32 30 1 32 8 1 8 1 5 3 1 2 2 2 3 1 1 1 1 4 4 1 1 2 6 3 3 5 1 11 1 10 48 14 1 13 3 2 12 12 12 2 2 2 139 139 127 140 1 139 21 1 12 9 9 9 5 1 1 1 4 21 27 2 27 7 21 14 7 88 88 88 88 88 88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2011 Novell Inc. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/rbtree.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/ratelimit.h> #include "overlayfs.h" struct ovl_cache_entry { unsigned int len; unsigned int type; u64 real_ino; u64 ino; struct list_head l_node; struct rb_node node; struct ovl_cache_entry *next_maybe_whiteout; bool is_upper; bool is_whiteout; bool check_xwhiteout; char name[]; }; struct ovl_dir_cache { long refcount; u64 version; struct list_head entries; struct rb_root root; }; struct ovl_readdir_data { struct dir_context ctx; struct dentry *dentry; bool is_lowest; struct rb_root *root; struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; int count; int err; bool is_upper; bool d_type_supported; bool in_xwhiteouts_dir; }; struct ovl_dir_file { bool is_real; bool is_upper; struct ovl_dir_cache *cache; struct list_head *cursor; struct file *realfile; struct file *upperfile; }; static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) { return rb_entry(n, struct ovl_cache_entry, node); } static bool ovl_cache_entry_find_link(const char *name, int len, struct rb_node ***link, struct rb_node **parent) { bool found = false; struct rb_node **newp = *link; while (!found && *newp) { int cmp; struct ovl_cache_entry *tmp; *parent = *newp; tmp = ovl_cache_entry_from_node(*newp); cmp = strncmp(name, tmp->name, len); if (cmp > 0) newp = &tmp->node.rb_right; else if (cmp < 0 || len < tmp->len) newp = &tmp->node.rb_left; else found = true; } *link = newp; return found; } static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, const char *name, int len) { struct rb_node *node = root->rb_node; int cmp; while (node) { struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); cmp = strncmp(name, p->name, len); if (cmp > 0) node = p->node.rb_right; else if (cmp < 0 || len < p->len) node = p->node.rb_left; else return p; } return NULL; } static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, struct ovl_cache_entry *p) { /* Don't care if not doing ovl_iter() */ if (!rdd->dentry) return false; /* Always recalc d_ino when remapping lower inode numbers */ if (ovl_xino_bits(OVL_FS(rdd->dentry->d_sb))) return true; /* Always recalc d_ino for parent */ if (strcmp(p->name, "..") == 0) return true; /* If this is lower, then native d_ino will do */ if (!rdd->is_upper) return false; /* * Recalc d_ino for '.' and for all entries if dir is impure (contains * copied up entries) */ if ((p->name[0] == '.' && p->len == 1) || ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) return true; return false; } static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); p = kmalloc(size, GFP_KERNEL); if (!p) return NULL; memcpy(p->name, name, len); p->name[len] = '\0'; p->len = len; p->type = d_type; p->real_ino = ino; p->ino = ino; /* Defer setting d_ino for upper entry to ovl_iterate() */ if (ovl_calc_d_ino(rdd, p)) p->ino = 0; p->is_upper = rdd->is_upper; p->is_whiteout = false; /* Defer check for overlay.whiteout to ovl_iterate() */ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p; } return p; } static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; if (ovl_cache_entry_find_link(name, len, &newp, &parent)) return true; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); return true; } static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(rdd->root, name, namelen); if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else list_add_tail(&p->l_node, &rdd->middle); } return rdd->err == 0; } void ovl_cache_free(struct list_head *list) { struct ovl_cache_entry *p; struct ovl_cache_entry *n; list_for_each_entry_safe(p, n, list, l_node) kfree(p); INIT_LIST_HEAD(list); } void ovl_dir_cache_free(struct inode *inode) { struct ovl_dir_cache *cache = ovl_dir_cache(inode); if (cache) { ovl_cache_free(&cache->entries); kfree(cache); } } static void ovl_cache_put(struct ovl_dir_file *od, struct inode *inode) { struct ovl_dir_cache *cache = od->cache; WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { if (ovl_dir_cache(inode) == cache) ovl_set_dir_cache(inode, NULL); ovl_cache_free(&cache->entries); kfree(cache); } } static bool ovl_fill_merge(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; if (!rdd->is_lowest) return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); else return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); } static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err; struct ovl_cache_entry *p; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); } } inode_unlock(dir->d_inode); } ovl_revert_creds(old_cred); return err; } static inline int ovl_dir_read(const struct path *realpath, struct ovl_readdir_data *rdd) { struct file *realfile; int err; realfile = ovl_path_open(realpath, O_RDONLY | O_LARGEFILE); if (IS_ERR(realfile)) return PTR_ERR(realfile); rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { rdd->count = 0; rdd->err = 0; err = iterate_dir(realfile, &rdd->ctx); if (err >= 0) err = rdd->err; } while (!err && rdd->count); if (!err && rdd->first_maybe_whiteout && rdd->dentry) err = ovl_check_whiteouts(realpath, rdd); fput(realfile); return err; } static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; struct inode *inode = file_inode(file); bool is_real; if (cache && ovl_inode_version_get(inode) != cache->version) { ovl_cache_put(od, inode); od->cache = NULL; od->cursor = NULL; } is_real = ovl_dir_is_real(inode); if (od->is_real != is_real) { /* is_real can only become false when dir is copied up */ if (WARN_ON(is_real)) return; od->is_real = false; } } static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = dentry, .list = list, .root = root, .is_lowest = false, }; int idx, next; const struct ovl_layer *layer; for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath, &layer); rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && ovl_dentry_has_xwhiteouts(dentry); if (next != -1) { err = ovl_dir_read(&realpath, &rdd); if (err) break; } else { /* * Insert lowest layer entries before upper ones, this * allows offsets to be reasonably constant */ list_add(&rdd.middle, rdd.list); rdd.is_lowest = true; err = ovl_dir_read(&realpath, &rdd); list_del(&rdd.middle); } } return err; } static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) { struct list_head *p; loff_t off = 0; list_for_each(p, &od->cache->entries) { if (off >= pos) break; off++; } /* Cursor is safe since the cache is stable */ od->cursor = p; } static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; struct ovl_dir_cache *cache; struct inode *inode = d_inode(dentry); cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) { WARN_ON(!cache->refcount); cache->refcount++; return cache; } ovl_set_dir_cache(d_inode(dentry), NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); cache->root = RB_ROOT; res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } /* Map inode number to lower fs unique range */ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, const char *name, int namelen, bool warn) { unsigned int xinoshift = 64 - xinobits; if (unlikely(ino >> xinoshift)) { if (warn) { pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", namelen, name, ino, xinobits); } return ino; } /* * The lowest xinobit is reserved for mapping the non-peresistent inode * numbers range, but this range is only exposed via st_ino, not here. */ return ino | ((u64)fsid) << (xinoshift + 1); } /* * Set d_ino for upper entries if needed. Non-upper entries should always report * the uppermost real inode ino and should not call this function. * * When not all layer are on same fs, report real ino also for upper. * * When all layers are on the same fs, and upper has a reference to * copy up origin, call vfs_getattr() on the overlay entry to make * sure that d_ino will be consistent with st_ino from stat(2). * * Also checks the overlay.whiteout xattr by doing a full lookup which will return * negative in this case. */ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, bool update_ino) { struct dentry *dir = path->dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); struct dentry *this = NULL; enum ovl_path_type type; u64 ino = p->real_ino; int xinobits = ovl_xino_bits(ofs); int err = 0; if (!ovl_same_dev(ofs) && !p->check_xwhiteout) goto out; if (p->name[0] == '.') { if (p->len == 1) { this = dget(dir); goto get; } if (p->len == 2 && p->name[1] == '.') { /* we shall not be moved */ this = dget(dir->d_parent); goto get; } } /* This checks also for xwhiteouts */ this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; goto fail; } goto out; } get: if (!ovl_same_dev(ofs) || !update_ino) goto out; type = ovl_path_type(this); if (OVL_TYPE_ORIGIN(type)) { struct kstat stat; struct path statpath = *path; statpath.dentry = this; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) goto fail; /* * Directory inode is always on overlay st_dev. * Non-dir with ovl_same_dev() could be on pseudo st_dev in case * of xino bits overflow. */ WARN_ON_ONCE(S_ISDIR(stat.mode) && dir->d_sb->s_dev != stat.dev); ino = stat.ino; } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, ovl_layer_lower(this)->fsid, p->name, p->len, ovl_xino_warn(ofs)); } out: p->ino = ino; dput(this); return err; fail: pr_warn_ratelimited("failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } static bool ovl_fill_plain(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; } list_add_tail(&p->l_node, rdd->list); return true; } static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct rb_root *root) { int err; struct path realpath; struct ovl_cache_entry *p, *n; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .list = list, .root = root, }; INIT_LIST_HEAD(list); *root = RB_ROOT; ovl_path_upper(path->dentry, &realpath); err = ovl_dir_read(&realpath, &rdd); if (err) return err; list_for_each_entry_safe(p, n, list, l_node) { if (strcmp(p->name, ".") != 0 && strcmp(p->name, "..") != 0) { err = ovl_cache_update(path, p, true); if (err) return err; } if (p->ino == p->real_ino) { list_del(&p->l_node); kfree(p); } else { struct rb_node **newp = &root->rb_node; struct rb_node *parent = NULL; if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, &newp, &parent))) return -EIO; rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, root); } } return 0; } static struct ovl_dir_cache *ovl_cache_get_impure(const struct path *path) { int res; struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_dir_cache *cache; cache = ovl_dir_cache(inode); if (cache && ovl_inode_version_get(inode) == cache->version) return cache; /* Impure cache is not refcounted, free it here */ ovl_dir_cache_free(inode); ovl_set_dir_cache(inode, NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); res = ovl_dir_read_impure(path, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } if (list_empty(&cache->entries)) { /* * A good opportunity to get rid of an unneeded "impure" flag. * Removing the "impure" xattr is best effort. */ if (!ovl_want_write(dentry)) { ovl_removexattr(ofs, ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); ovl_drop_write(dentry); } ovl_clear_flag(OVL_IMPURE, inode); kfree(cache); return NULL; } cache->version = ovl_inode_version_get(inode); ovl_set_dir_cache(inode, cache); return cache; } struct ovl_readdir_translate { struct dir_context *orig_ctx; struct ovl_dir_cache *cache; struct dir_context ctx; u64 parent_ino; int fsid; int xinobits; bool xinowarn; }; static bool ovl_fill_real(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_translate *rdt = container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; } else if (rdt->cache) { struct ovl_cache_entry *p; p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); if (p) ino = p->ino; } else if (rdt->xinobits) { ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, name, namelen, rdt->xinowarn); } return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); } static bool ovl_is_impure_dir(struct file *file) { struct ovl_dir_file *od = file->private_data; struct inode *dir = file_inode(file); /* * Only upper dir can be impure, but if we are in the middle of * iterating a lower real dir, dir could be copied up and marked * impure. We only want the impure cache if we started iterating * a real upper dir to begin with. */ return od->is_upper && ovl_test_flag(OVL_IMPURE, dir); } static int ovl_iterate_real(struct file *file, struct dir_context *ctx) { int err; struct ovl_dir_file *od = file->private_data; struct dentry *dir = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dir->d_sb); const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, .orig_ctx = ctx, .xinobits = ovl_xino_bits(ofs), .xinowarn = ovl_xino_warn(ofs), }; if (rdt.xinobits && lower_layer) rdt.fsid = lower_layer->fsid; if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { struct kstat stat; struct path statpath = file->f_path; statpath.dentry = dir->d_parent; err = vfs_getattr(&statpath, &stat, STATX_INO, 0); if (err) return err; WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); rdt.parent_ino = stat.ino; } if (ovl_is_impure_dir(file)) { rdt.cache = ovl_cache_get_impure(&file->f_path); if (IS_ERR(rdt.cache)) return PTR_ERR(rdt.cache); } err = iterate_dir(od->realfile, &rdt.ctx); ctx->pos = rdt.ctx.pos; return err; } static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_cache_entry *p; const struct cred *old_cred; int err; old_cred = ovl_override_creds(dentry->d_sb); if (!ctx->pos) ovl_dir_reset(file); if (od->is_real) { /* * If parent is merge, then need to adjust d_ino for '..', if * dir is impure then need to adjust d_ino for copied up * entries. */ if (ovl_xino_bits(ofs) || (ovl_same_fs(ofs) && (ovl_is_impure_dir(file) || OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent))))) { err = ovl_iterate_real(file, ctx); } else { err = iterate_dir(od->realfile, ctx); } goto out; } if (!od->cache) { struct ovl_dir_cache *cache; cache = ovl_cache_get(dentry); err = PTR_ERR(cache); if (IS_ERR(cache)) goto out; od->cache = cache; ovl_seek_cursor(od, ctx->pos); } while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); if (!p->is_whiteout) { if (!p->ino || p->check_xwhiteout) { err = ovl_cache_update(&file->f_path, p, !p->ino); if (err) goto out; } } /* ovl_cache_update() sets is_whiteout on stale entry */ if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; } od->cursor = p->l_node.next; ctx->pos++; } err = 0; out: ovl_revert_creds(old_cred); return err; } static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) { loff_t res; struct ovl_dir_file *od = file->private_data; inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); if (od->is_real) { res = vfs_llseek(od->realfile, offset, origin); file->f_pos = od->realfile->f_pos; } else { res = -EINVAL; switch (origin) { case SEEK_CUR: offset += file->f_pos; break; case SEEK_SET: break; default: goto out_unlock; } if (offset < 0) goto out_unlock; if (offset != file->f_pos) { file->f_pos = offset; if (od->cache) ovl_seek_cursor(od, offset); } res = offset; } out_unlock: inode_unlock(file_inode(file)); return res; } static struct file *ovl_dir_open_realfile(const struct file *file, const struct path *realpath) { struct file *res; const struct cred *old_cred; old_cred = ovl_override_creds(file_inode(file)->i_sb); res = ovl_path_open(realpath, O_RDONLY | (file->f_flags & O_LARGEFILE)); ovl_revert_creds(old_cred); return res; } /* * Like ovl_real_fdget(), returns upperfile if dir was copied up since open. * Unlike ovl_real_fdget(), this caches upperfile in file->private_data. * * TODO: use same abstract type for file->private_data of dir and file so * upperfile could also be cached for files as well. */ struct file *ovl_dir_real_file(const struct file *file, bool want_upper) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct file *old, *realfile = od->realfile; if (!OVL_TYPE_UPPER(ovl_path_type(dentry))) return want_upper ? NULL : realfile; /* * Need to check if we started out being a lower dir, but got copied up */ if (!od->is_upper) { realfile = READ_ONCE(od->upperfile); if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_dir_open_realfile(file, &upperpath); if (IS_ERR(realfile)) return realfile; old = cmpxchg_release(&od->upperfile, NULL, realfile); if (old) { fput(realfile); realfile = old; } } } return realfile; } static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct file *realfile; int err; err = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (err <= 0) return err; realfile = ovl_dir_real_file(file, true); err = PTR_ERR_OR_ZERO(realfile); /* Nothing to sync for lower */ if (!realfile || err) return err; return vfs_fsync_range(realfile, start, end, datasync); } static int ovl_dir_release(struct inode *inode, struct file *file) { struct ovl_dir_file *od = file->private_data; if (od->cache) { inode_lock(inode); ovl_cache_put(od, inode); inode_unlock(inode); } fput(od->realfile); if (od->upperfile) fput(od->upperfile); kfree(od); return 0; } static int ovl_dir_open(struct inode *inode, struct file *file) { struct path realpath; struct file *realfile; struct ovl_dir_file *od; enum ovl_path_type type; od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); if (!od) return -ENOMEM; type = ovl_path_real(file->f_path.dentry, &realpath); realfile = ovl_dir_open_realfile(file, &realpath); if (IS_ERR(realfile)) { kfree(od); return PTR_ERR(realfile); } od->realfile = realfile; od->is_real = ovl_dir_is_real(inode); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; return 0; } WRAP_DIR_ITER(ovl_iterate) // FIXME! const struct file_operations ovl_dir_operations = { .read = generic_read_dir, .open = ovl_dir_open, .iterate_shared = shared_ovl_iterate, .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, }; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); err = ovl_dir_read_merged(dentry, list, &root); ovl_revert_creds(old_cred); if (err) return err; err = 0; list_for_each_entry_safe(p, n, list, l_node) { /* * Select whiteouts in upperdir, they should * be cleared when deleting this directory. */ if (p->is_whiteout) { if (p->is_upper) continue; goto del_entry; } if (p->name[0] == '.') { if (p->len == 1) goto del_entry; if (p->len == 2 && p->name[1] == '.') goto del_entry; } err = -ENOTEMPTY; break; del_entry: list_del(&p->l_node); kfree(p); } return err; } void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; dentry = ovl_lookup_upper(ofs, p->name, upper, p->len); if (IS_ERR(dentry)) { pr_err("lookup '%s/%.*s' failed (%i)\n", upper->d_name.name, p->len, p->name, (int) PTR_ERR(dentry)); continue; } if (dentry->d_inode) ovl_cleanup(ofs, upper->d_inode, dentry); dput(dentry); } inode_unlock(upper->d_inode); } static bool ovl_check_d_type(struct dir_context *ctx, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); /* Even if d_type is not supported, DT_DIR is returned for . and .. */ if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen)) return true; if (d_type != DT_UNKNOWN) rdd->d_type_supported = true; return true; } /* * Returns 1 if d_type is supported, 0 not supported/unknown. Negative values * if error is encountered. */ int ovl_check_d_type_supported(const struct path *realpath) { int err; struct ovl_readdir_data rdd = { .ctx.actor = ovl_check_d_type, .d_type_supported = false, }; err = ovl_dir_read(realpath, &rdd); if (err) return err; return rdd.d_type_supported; } #define OVL_INCOMPATDIR_NAME "incompat" static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *path, int level) { int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .list = &list, }; bool incompat = false; /* * The "work/incompat" directory is treated specially - if it is not * empty, instead of printing a generic error and mounting read-only, * we will error about incompat features and fail the mount. * * When called from ovl_indexdir_cleanup(), path->dentry->d_name.name * starts with '#'. */ if (level == 2 && !strcmp(path->dentry->d_name.name, OVL_INCOMPATDIR_NAME)) incompat = true; err = ovl_dir_read(path, &rdd); if (err) goto out; inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { struct dentry *dentry; if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } else if (incompat) { pr_err("overlay with incompat feature '%s' cannot be mounted\n", p->name); err = -EINVAL; break; } dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len); if (IS_ERR(dentry)) continue; if (dentry->d_inode) err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level); dput(dentry); if (err) break; } inode_unlock(dir); out: ovl_cache_free(&list); return err; } int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level) { int err; if (!d_is_dir(dentry) || level > 1) { return ovl_cleanup(ofs, dir, dentry); } err = ovl_do_rmdir(ofs, dir, dentry); if (err) { struct path path = { .mnt = mnt, .dentry = dentry }; inode_unlock(dir); err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1); inode_lock_nested(dir, I_MUTEX_PARENT); if (!err) err = ovl_cleanup(ofs, dir, dentry); } return err; } int ovl_indexdir_cleanup(struct ovl_fs *ofs) { int err; struct dentry *indexdir = ofs->workdir; struct dentry *index = NULL; struct inode *dir = indexdir->d_inode; struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir }; LIST_HEAD(list); struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, .list = &list, }; err = ovl_dir_read(&path, &rdd); if (err) goto out; inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { if (p->name[0] == '.') { if (p->len == 1) continue; if (p->len == 2 && p->name[1] == '.') continue; } index = ovl_lookup_upper(ofs, p->name, indexdir, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; break; } /* Cleanup leftover from index create/cleanup attempt */ if (index->d_name.name[0] == '#') { err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1); if (err) break; goto next; } err = ovl_verify_index(ofs, index); if (!err) { goto next; } else if (err == -ESTALE) { /* Cleanup stale index entries */ err = ovl_cleanup(ofs, dir, index); } else if (err != -ENOENT) { /* * Abort mount to avoid corrupting the index if * an incompatible index entry was found or on out * of memory. */ break; } else if (ofs->config.nfs_export) { /* * Whiteout orphan index to block future open by * handle after overlay nlink dropped to zero. */ err = ovl_cleanup_and_whiteout(ofs, dir, index); } else { /* Cleanup orphan index entries */ err = ovl_cleanup(ofs, dir, index); } if (err) break; next: dput(index); index = NULL; } dput(index); inode_unlock(dir); out: ovl_cache_free(&list); if (err) pr_err("failed index dir cleanup (%i)\n", err); return err; }
3 15 6 9 6 8 1 14 4 3 7 2 1 4 1 1 1 1 1 7 7 7 7 3 1 8 8 6 2 8 8 3 4 2 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static int get_ifindex(const struct net_device *dev) { return dev ? dev->ifindex : 0; } static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, const struct nft_pktinfo *pkt, const struct net_device *dev, struct ipv6hdr *iph) { int lookup_flags = 0; if (priv->flags & NFTA_FIB_F_DADDR) { fl6->daddr = iph->daddr; fl6->saddr = iph->saddr; } else { if (nft_hook(pkt) == NF_INET_FORWARD && priv->flags & NFTA_FIB_F_IIF) fl6->flowi6_iif = nft_out(pkt)->ifindex; fl6->daddr = iph->saddr; fl6->saddr = iph->daddr; } if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); } if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) lookup_flags |= RT6_LOOKUP_F_HAS_SADDR; if (priv->flags & NFTA_FIB_F_MARK) fl6->flowi6_mark = pkt->skb->mark; fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; return lookup_flags; } static u32 __nft_fib6_eval_type(const struct nft_fib *priv, const struct nft_pktinfo *pkt, struct ipv6hdr *iph) { const struct net_device *dev = NULL; int route_err, addrtype; struct rt6_info *rt; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; u32 ret = 0; if (priv->flags & NFTA_FIB_F_IIF) dev = nft_in(pkt); else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) ret = RTN_LOCAL; route_err = nf_ip6_route(nft_net(pkt), (struct dst_entry **)&rt, flowi6_to_flowi(&fl6), false); if (route_err) goto err; if (rt->rt6i_flags & RTF_REJECT) { route_err = rt->dst.error; dst_release(&rt->dst); goto err; } if (ipv6_anycast_destination((struct dst_entry *)rt, &fl6.daddr)) ret = RTN_ANYCAST; else if (!dev && rt->rt6i_flags & RTF_LOCAL) ret = RTN_LOCAL; dst_release(&rt->dst); if (ret) return ret; addrtype = ipv6_addr_type(&fl6.daddr); if (addrtype & IPV6_ADDR_MULTICAST) return RTN_MULTICAST; if (addrtype & IPV6_ADDR_UNICAST) return RTN_UNICAST; return RTN_UNSPEC; err: switch (route_err) { case -EINVAL: return RTN_BLACKHOLE; case -EACCES: return RTN_PROHIBIT; case -EAGAIN: return RTN_THROW; default: break; } return RTN_UNREACHABLE; } void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); u32 *dest = &regs->data[priv->dreg]; struct ipv6hdr *iph, _iph; iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { regs->verdict.code = NFT_BREAK; return; } *dest = __nft_fib6_eval_type(priv, pkt, iph); } EXPORT_SYMBOL_GPL(nft_fib6_eval_type); static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const struct ipv6hdr *iph) { if (likely(next != IPPROTO_ICMPV6)) return false; if (ipv6_addr_type(&iph->saddr) != IPV6_ADDR_ANY) return false; return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL; } void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); const struct net_device *oif = NULL; u32 *dest = &regs->data[priv->dreg]; struct ipv6hdr *iph, _iph; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), .flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; struct rt6_info *rt; int lookup_flags; if (priv->flags & NFTA_FIB_F_IIF) oif = nft_in(pkt); else if (priv->flags & NFTA_FIB_F_OIF) oif = nft_out(pkt); iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { regs->verdict.code = NFT_BREAK; return; } lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); if (nft_hook(pkt) == NF_INET_PRE_ROUTING || nft_hook(pkt) == NF_INET_INGRESS) { if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) || nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { nft_fib_store_result(dest, priv, nft_in(pkt)); return; } } *dest = 0; rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, lookup_flags); if (rt->dst.error) goto put_rt_err; /* Should not see RTF_LOCAL here */ if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; if (oif && oif != rt->rt6i_idev->dev && l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) goto put_rt_err; nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); put_rt_err: ip6_rt_put(rt); } EXPORT_SYMBOL_GPL(nft_fib6_eval); static struct nft_expr_type nft_fib6_type; static const struct nft_expr_ops nft_fib6_type_ops = { .type = &nft_fib6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), .eval = nft_fib6_eval_type, .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, .reduce = nft_fib_reduce, }; static const struct nft_expr_ops nft_fib6_ops = { .type = &nft_fib6_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_fib)), .eval = nft_fib6_eval, .init = nft_fib_init, .dump = nft_fib_dump, .validate = nft_fib_validate, .reduce = nft_fib_reduce, }; static const struct nft_expr_ops * nft_fib6_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { enum nft_fib_result result; if (!tb[NFTA_FIB_RESULT]) return ERR_PTR(-EINVAL); result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT])); switch (result) { case NFT_FIB_RESULT_OIF: return &nft_fib6_ops; case NFT_FIB_RESULT_OIFNAME: return &nft_fib6_ops; case NFT_FIB_RESULT_ADDRTYPE: return &nft_fib6_type_ops; default: return ERR_PTR(-EOPNOTSUPP); } } static struct nft_expr_type nft_fib6_type __read_mostly = { .name = "fib", .select_ops = nft_fib6_select_ops, .policy = nft_fib_policy, .maxattr = NFTA_FIB_MAX, .family = NFPROTO_IPV6, .owner = THIS_MODULE, }; static int __init nft_fib6_module_init(void) { return nft_register_expr(&nft_fib6_type); } static void __exit nft_fib6_module_exit(void) { nft_unregister_expr(&nft_fib6_type); } module_init(nft_fib6_module_init); module_exit(nft_fib6_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_ALIAS_NFT_AF_EXPR(10, "fib"); MODULE_DESCRIPTION("nftables fib / ipv6 route lookup support");
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 /* BNEP implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2001-2002 Inventel Systemes Written 2001-2002 by Clément Moreau <clement.moreau@inventel.fr> David Libault <david.libault@inventel.fr> Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/etherdevice.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include "bnep.h" #define BNEP_TX_QUEUE_LEN 20 static int bnep_net_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static int bnep_net_close(struct net_device *dev) { netif_stop_queue(dev); return 0; } static void bnep_net_set_mc_list(struct net_device *dev) { #ifdef CONFIG_BT_BNEP_MC_FILTER struct bnep_session *s = netdev_priv(dev); struct sock *sk = s->sock->sk; struct bnep_set_filter_req *r; struct sk_buff *skb; int size; BT_DBG("%s mc_count %d", dev->name, netdev_mc_count(dev)); size = sizeof(*r) + (BNEP_MAX_MULTICAST_FILTERS + 1) * ETH_ALEN * 2; skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { BT_ERR("%s Multicast list allocation failed", dev->name); return; } r = (void *) skb->data; __skb_put(skb, sizeof(*r)); r->type = BNEP_CONTROL; r->ctrl = BNEP_FILTER_MULTI_ADDR_SET; if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { u8 start[ETH_ALEN] = { 0x01 }; /* Request all addresses */ __skb_put_data(skb, start, ETH_ALEN); __skb_put_data(skb, dev->broadcast, ETH_ALEN); r->len = htons(ETH_ALEN * 2); } else { struct netdev_hw_addr *ha; int i, len = skb->len; if (dev->flags & IFF_BROADCAST) { __skb_put_data(skb, dev->broadcast, ETH_ALEN); __skb_put_data(skb, dev->broadcast, ETH_ALEN); } /* FIXME: We should group addresses here. */ i = 0; netdev_for_each_mc_addr(ha, dev) { if (i == BNEP_MAX_MULTICAST_FILTERS) break; __skb_put_data(skb, ha->addr, ETH_ALEN); __skb_put_data(skb, ha->addr, ETH_ALEN); i++; } r->len = htons(skb->len - len); } skb_queue_tail(&sk->sk_write_queue, skb); wake_up_interruptible(sk_sleep(sk)); #endif } static int bnep_net_set_mac_addr(struct net_device *dev, void *arg) { BT_DBG("%s", dev->name); return 0; } static void bnep_net_timeout(struct net_device *dev, unsigned int txqueue) { BT_DBG("net_timeout"); netif_wake_queue(dev); } #ifdef CONFIG_BT_BNEP_MC_FILTER static int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s) { struct ethhdr *eh = (void *) skb->data; if ((eh->h_dest[0] & 1) && !test_bit(bnep_mc_hash(eh->h_dest), (ulong *) &s->mc_filter)) return 1; return 0; } #endif #ifdef CONFIG_BT_BNEP_PROTO_FILTER /* Determine ether protocol. Based on eth_type_trans. */ static u16 bnep_net_eth_proto(struct sk_buff *skb) { struct ethhdr *eh = (void *) skb->data; u16 proto = ntohs(eh->h_proto); if (proto >= ETH_P_802_3_MIN) return proto; if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF)) return ETH_P_802_3; return ETH_P_802_2; } static int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session *s) { u16 proto = bnep_net_eth_proto(skb); struct bnep_proto_filter *f = s->proto_filter; int i; for (i = 0; i < BNEP_MAX_PROTO_FILTERS && f[i].end; i++) { if (proto >= f[i].start && proto <= f[i].end) return 0; } BT_DBG("BNEP: filtered skb %p, proto 0x%.4x", skb, proto); return 1; } #endif static netdev_tx_t bnep_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct bnep_session *s = netdev_priv(dev); struct sock *sk = s->sock->sk; BT_DBG("skb %p, dev %p", skb, dev); #ifdef CONFIG_BT_BNEP_MC_FILTER if (bnep_net_mc_filter(skb, s)) { kfree_skb(skb); return NETDEV_TX_OK; } #endif #ifdef CONFIG_BT_BNEP_PROTO_FILTER if (bnep_net_proto_filter(skb, s)) { kfree_skb(skb); return NETDEV_TX_OK; } #endif /* * We cannot send L2CAP packets from here as we are potentially in a bh. * So we have to queue them and wake up session thread which is sleeping * on the sk_sleep(sk). */ netif_trans_update(dev); skb_queue_tail(&sk->sk_write_queue, skb); wake_up_interruptible(sk_sleep(sk)); if (skb_queue_len(&sk->sk_write_queue) >= BNEP_TX_QUEUE_LEN) { BT_DBG("tx queue is full"); /* Stop queuing. * Session thread will do netif_wake_queue() */ netif_stop_queue(dev); } return NETDEV_TX_OK; } static const struct net_device_ops bnep_netdev_ops = { .ndo_open = bnep_net_open, .ndo_stop = bnep_net_close, .ndo_start_xmit = bnep_net_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = bnep_net_set_mc_list, .ndo_set_mac_address = bnep_net_set_mac_addr, .ndo_tx_timeout = bnep_net_timeout, }; void bnep_net_setup(struct net_device *dev) { eth_broadcast_addr(dev->broadcast); dev->addr_len = ETH_ALEN; ether_setup(dev); dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->netdev_ops = &bnep_netdev_ops; dev->watchdog_timeo = HZ * 2; }
50 49 3 1 1 19 19 3 3 44 44 57 58 59 59 2 2 2 2 14 14 17 17 2 2 9 9 9 9 1 1 133 127 5 27 150 150 62 2 60 2 2 2 12 9 1 2 2 2 2 7 2 1 2 2 2 2 8 1 1 4 2 2 1 2 1 3 1 1 1 1 1 3 3 3 3 13 1 1 1 9 1 5 1 1 1 1 1 6 1 1 1 1 1 1 1 1 8 3 5 1 2 6 6 6 5 1 1 1 1 1 1 3 1 2 7 1 1 1 1 2 13 2 1 1 1 1 1 13 3 1 4 17 17 1 1 1 1 1 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 /* * net/tipc/node.c: TIPC node management routines * * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "node.h" #include "name_distr.h" #include "socket.h" #include "bcast.h" #include "monitor.h" #include "discover.h" #include "netlink.h" #include "trace.h" #include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7) }; struct tipc_link_entry { struct tipc_link *link; spinlock_t lock; /* per link */ u32 mtu; struct sk_buff_head inputq; struct tipc_media_addr maddr; }; struct tipc_bclink_entry { struct tipc_link *link; struct sk_buff_head inputq1; struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; u16 named_rcv_nxt; bool named_open; }; /** * struct tipc_node - TIPC node structure * @addr: network address of node * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any * @peer_id: 128-bit ID of peer * @peer_id_string: ID string of peer * @publ_list: list of publications * @conn_sks: list of connections (FIXME) * @timer: node's keepalive timer * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node * @peer_net: peer's net namespace * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; struct kref kref; rwlock_t lock; struct net *net; struct hlist_node hash; int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; struct tipc_bclink_entry bc_entry; int action_flags; struct list_head list; int state; bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; u32 signature; u32 link_id; u8 peer_id[16]; char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; struct net *peer_net; u32 peer_hash_mix; #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto *crypto_rx; #endif }; /* Node FSM states and events: */ enum { SELF_DOWN_PEER_DOWN = 0xdd, SELF_UP_PEER_UP = 0xaa, SELF_DOWN_PEER_LEAVING = 0xd1, SELF_UP_PEER_COMING = 0xac, SELF_COMING_PEER_UP = 0xca, SELF_LEAVING_PEER_DOWN = 0x1d, NODE_FAILINGOVER = 0xf0, NODE_SYNCHING = 0xcc }; enum { SELF_ESTABL_CONTACT_EVT = 0xece, SELF_LOST_CONTACT_EVT = 0x1ce, PEER_ESTABL_CONTACT_EVT = 0x9ece, PEER_LOST_CONTACT_EVT = 0x91ce, NODE_FAILOVER_BEGIN_EVT = 0xfbe, NODE_FAILOVER_END_EVT = 0xfee, NODE_SYNCH_BEGIN_EVT = 0xcbe, NODE_SYNCH_END_EVT = 0xcee }; static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr); static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; u32 peer_port; u32 peer_node; struct list_head list; }; static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) return NULL; return n->links[bearer_id].link; } int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; unsigned int mtu = MAX_MSG_SIZE; n = tipc_node_find(net, addr); if (unlikely(!n)) return mtu; /* Allow MAX_MSG_SIZE when building connection oriented message * if they are in the same core network */ if (n->peer_net && connected) { tipc_node_put(n); return mtu; } bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; tipc_node_put(n); return mtu; } bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) { u8 *own_id = tipc_own_id(net); struct tipc_node *n; if (!own_id) return true; if (addr == tipc_own_addr(net)) { memcpy(id, own_id, TIPC_NODEID_LEN); return true; } n = tipc_node_find(net, addr); if (!n) return false; memcpy(id, &n->peer_id, TIPC_NODEID_LEN); tipc_node_put(n); return true; } u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; u16 caps; n = tipc_node_find(net, addr); if (unlikely(!n)) return TIPC_NODE_CAPABILITIES; caps = n->capabilities; tipc_node_put(n); return caps; } u32 tipc_node_get_addr(struct tipc_node *node) { return (node) ? node->addr : 0; } char *tipc_node_get_id_str(struct tipc_node *node) { return node->peer_id_string; } #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) { return (__n) ? __n->crypto_rx : NULL; } struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) { struct tipc_node *n; n = tipc_node_find(net, addr); return (n) ? n->crypto_rx : NULL; } #endif static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&n->crypto_rx); #endif kfree(n); } static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); call_rcu(&n->rcu, tipc_node_free); } void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } /* * tipc_node_find - locate specified node object, if it exists */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; break; } rcu_read_unlock(); return node; } /* tipc_node_find_by_id - locate specified node object by its 128-bit id * Note: this function is called only when a discovery request failed * to find the node by its 32-bit id, and is not time critical */ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool found = false; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { read_lock_bh(&n->lock); if (!memcmp(id, n->peer_id, 16) && kref_get_unless_zero(&n->kref)) found = true; read_unlock_bh(&n->lock); if (found) break; } rcu_read_unlock(); return found ? n : NULL; } static void tipc_node_read_lock(struct tipc_node *n) __acquires(n->lock) { read_lock_bh(&n->lock); } static void tipc_node_read_unlock(struct tipc_node *n) __releases(n->lock) { read_unlock_bh(&n->lock); } static void tipc_node_write_lock(struct tipc_node *n) __acquires(n->lock) { write_lock_bh(&n->lock); } static void tipc_node_write_unlock_fast(struct tipc_node *n) __releases(n->lock) { write_unlock_bh(&n->lock); } static void tipc_node_write_unlock(struct tipc_node *n) __releases(n->lock) { struct tipc_socket_addr sk; struct net *net = n->net; u32 flags = n->action_flags; struct list_head *publ_list; struct tipc_uaddr ua; u32 bearer_id, node; if (likely(!flags)) { write_unlock_bh(&n->lock); return; } tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, TIPC_LINK_STATE, n->addr, n->addr); sk.ref = n->link_id; sk.node = tipc_own_addr(net); node = n->addr; bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) tipc_publ_notify(net, publ_list, node, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, node, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, node, bearer_id); tipc_nametbl_publish(net, &ua, &sk, sk.ref); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, node, bearer_id); tipc_nametbl_withdraw(net, &ua, &sk, sk.ref); } } static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) { int net_id = tipc_netid(n->net); struct tipc_net *tn_peer; struct net *tmp; u32 hash_chk; if (n->peer_net) return; for_each_net_rcu(tmp) { tn_peer = tipc_net(tmp); if (!tn_peer) continue; /* Integrity checking whether node exists in namespace or not */ if (tn_peer->net_id != net_id) continue; if (memcmp(n->peer_id, tn_peer->node_id, NODE_ID_LEN)) continue; hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); if (hash_mixes ^ hash_chk) continue; n->peer_net = tmp; n->peer_hash_mix = hash_mixes; break; } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr) ?: tipc_node_find_by_id(net, peer_id); if (n) { if (!n->preliminary) goto update; if (preliminary) goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link refresh failed, no memory\n"); tipc_node_write_unlock_fast(n); tipc_node_put(n); n = NULL; goto exit; } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_del_rcu(&n->list); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); tipc_node_write_unlock_fast(n); update: if (n->peer_hash_mix ^ hash_mixes) tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } tipc_node_write_unlock_fast(n); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } tipc_nodeid2string(n->peer_id_string, peer_id); #ifdef CONFIG_TIPC_CRYPTO if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); kfree(n); n = NULL; goto exit; } #endif n->addr = addr; n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; n->peer_net = NULL; n->peer_hash_mix = 0; /* Assign kernel local namespace if exists */ tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); INIT_HLIST_NODE(&n->hash); INIT_LIST_HEAD(&n->list); INIT_LIST_HEAD(&n->publ_list); INIT_LIST_HEAD(&n->conn_sks); skb_queue_head_init(&n->bc_entry.namedq); skb_queue_head_init(&n->bc_entry.inputq1); __skb_queue_head_init(&n->bc_entry.arrvq); skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; if (!preliminary && !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); tipc_node_put(n); n = NULL; goto exit; } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ n->keepalive_intv = 10000; intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; /* Link with lowest tolerance determines timer interval */ if (intv < n->keepalive_intv) n->keepalive_intv = intv; /* Ensure link's abort limit corresponds to current tolerance */ tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete_from_list(struct tipc_node *node) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_key_flush(node->crypto_rx); #endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); } static void tipc_node_delete(struct tipc_node *node) { trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); } void tipc_node_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); tipc_node_write_unlock_fast(n); tipc_node_put(n); } void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_del_init(subscr); tipc_node_write_unlock_fast(n); tipc_node_put(n); } int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; int err = 0; if (in_own_node(net, dnode)) return 0; node = tipc_node_find(net, dnode); if (!node) { pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) { struct tipc_node *node; struct tipc_sock_conn *conn, *safe; if (in_own_node(net, dnode)) return; node = tipc_node_find(net, dnode); if (!node) return; tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } tipc_node_write_unlock(node); tipc_node_put(node); } static void tipc_node_clear_links(struct tipc_node *node) { int i; for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link_entry *le = &node->links[i]; if (le->link) { kfree(le->link); le->link = NULL; node->link_cnt--; } } } /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; /* If lock held by tipc_node_stop() the node will be deleted anyway */ if (!spin_trylock_bh(&tn->node_list_lock)) return false; tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { tipc_node_clear_links(peer); tipc_node_delete_from_list(peer); deleted = true; } tipc_node_write_unlock(peer); if (!deleted) { spin_unlock_bh(&tn->node_list_lock); return deleted; } /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(peer->net, (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) { struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; int bearer_id; int rc = 0; trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); return; } #ifdef CONFIG_TIPC_CRYPTO /* Take any crypto key related actions first */ tipc_crypto_timeout(n->crypto_rx); #endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be * recalculated with link lowest tolerance */ tipc_node_read_lock(n); n->keepalive_intv = 10000; tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; if (le->link) { spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); spin_unlock_bh(&le->lock); remains--; } tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** * __tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. */ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); if (!tipc_link_is_up(nl)) return; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } /* Second link => redistribute slots */ if (tipc_link_prio(nl) > tipc_link_prio(ol)) { pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_media_addr *maddr; tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } /** * tipc_node_link_failover() - start failover in case "half-failover" * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. * This can happen when e.g.:: * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> * * ==> Node 1 does never start link/node failover! * * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) * @tnl: tunnel link * @xmitq: queue for messages to be xmited on tnl link later */ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { /* Avoid to be "self-failover" that can never end */ if (!tipc_link_is_up(tnl)) return; /* Don't rush, failure link may be in the process of resetting */ if (l && !tipc_link_is_reset(l)) return; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_failover_prepare(l, tnl, xmitq); if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); } /** * __tipc_node_link_down - handle loss of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr) { struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; *slot1 = INVALID_BEARER_ID; for (i = 0; i < MAX_BEARERS; i++) { _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; if (_l == l) continue; prio = tipc_link_prio(_l); if (prio < highest) continue; if (prio > highest) { highest = prio; *slot0 = i; *slot1 = i; continue; } *slot1 = i; } if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); return; } tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ *bearer_id = n->active_links[0]; tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; struct sk_buff_head xmitq; if (!l) return; __skb_queue_head_init(&xmitq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } if (delete) { kfree(l); le->link = NULL; n->link_cnt--; } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } bool tipc_node_is_up(struct net *net, u32 addr) { struct tipc_node *n; bool retval = false; if (in_own_node(net, addr)) return true; n = tipc_node_find(net, addr); if (!n) return false; retval = node_is_up(n); tipc_node_put(n); return retval; } static u32 tipc_node_suggest_addr(struct net *net, u32 addr) { struct tipc_node *n; addr ^= tipc_net(net)->random; while ((n = tipc_node_find(net, addr))) { tipc_node_put(n); addr++; } return addr; } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool preliminary; u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); if (n) { if (!memcmp(n->peer_id, id, NODE_ID_LEN)) addr = 0; tipc_node_put(n); if (!addr) return 0; return tipc_node_suggest_addr(net, addr); } /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { sugg_addr = n->addr; preliminary = n->preliminary; tipc_node_put(n); if (!preliminary) return sugg_addr; } /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); return 0; } void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool link_is_reset = false; bool accept_addr = false; bool reset = false; char *if_name; unsigned long intv; u16 session; *dupl_addr = false; *respond = false; n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, false); if (!n) return; tipc_node_write_lock(n); le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; } else if (sign_match && !addr_match && link_up) { /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. */ *dupl_addr = true; } else if (sign_match && !addr_match && !link_up) { /* Peer link has changed i/f address without rebooting. * It may also be a cloned or malicious peer; we can't * distinguish between the two. * The signature is correct, so we must accept. */ accept_addr = true; *respond = true; reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already * reset and re-established contact with the peer, before * receiving a discovery message from that node. * (The peer happened to receive one from this node first). * - The peer came back so fast that our side has not * discovered it yet. Probing from this side will soon * reset the link, since there can be no working link * endpoint at the peer end, and the link will re-establish. * Accept the signature, since it comes from a known peer. */ n->signature = signature; } else if (!sign_match && addr_match && !link_up) { /* The peer node has rebooted. * Accept signature, since it is a known peer. */ n->signature = signature; *respond = true; } else if (!sign_match && !addr_match && link_up) { /* Peer rebooted with new address, or a new/duplicate peer. * Ignore until the link goes down, if ever. */ *dupl_addr = true; } else if (!sign_match && !addr_match && !link_up) { /* Peer rebooted with new address, or it is a new peer. * Accept signature and address. */ n->signature = signature; accept_addr = true; *respond = true; reset = true; } if (!accept_addr) goto exit; /* Now create new link if not already existing */ if (!l) { if (n->link_cnt == 2) goto exit; if_name = strchr(b->name, ':') + 1; get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, &n->bc_entry.namedq, &l)) { *respond = false; goto exit; } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); if (n->link_cnt == 1) { intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_link_down(n, bearer_id, true); } rcu_read_unlock(); } static void tipc_node_reset_links(struct tipc_node *n) { int i; pr_warn("Resetting all links to %x\n", n->addr); trace_tipc_node_reset_links(n, true, " "); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); } } /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) { int state = n->state; switch (state) { case SELF_DOWN_PEER_DOWN: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_COMING; break; case PEER_ESTABL_CONTACT_EVT: state = SELF_COMING_PEER_UP; break; case SELF_LOST_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_UP: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_BEGIN_EVT: state = NODE_SYNCHING; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_END_EVT: break; default: goto illegal_evt; } break; case SELF_DOWN_PEER_LEAVING: switch (evt) { case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case SELF_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_COMING: switch (evt) { case PEER_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_BEGIN_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_COMING_PEER_UP: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_LEAVING_PEER_DOWN: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case NODE_FAILINGOVER: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_FAILOVER_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_SYNCH_END_EVT: default: goto illegal_evt; } break; case NODE_SYNCHING: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case NODE_SYNCH_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; default: pr_err("Unknown node fsm state %x\n", state); break; } trace_tipc_node_fsm(n->peer_id, n->state, state, evt); n->state = state; return; illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); trace_tipc_node_fsm(n->peer_id, n->state, state, evt); } static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; pr_debug("Lost contact with %x\n", n->addr); n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); trace_tipc_node_lost_contact(n, true, " "); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; n->peer_net = NULL; n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } } /** * tipc_node_get_linkname - get the name of a link * * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer * @len: size of @linkname output buffer * * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); if (!node) return err; if (bearer_id >= MAX_BEARERS) goto exit; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { strncpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); exit: tipc_node_put(node); return err; } /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NODE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head inputq; switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (msg_connected(hdr) || msg_named(hdr) || msg_direct(hdr)) { tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; } if (msg_mcast(hdr)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); return; } return; case MSG_FRAGMENTER: if (tipc_msg_assemble(list)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); } return; case GROUP_PROTOCOL: case CONN_MANAGER: tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; case LINK_PROTOCOL: case NAME_DISTRIBUTOR: case TUNNEL_PROTOCOL: case BCAST_PROTOCOL: return; default: return; } } /** * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. * Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) { struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; bool node_up = false; struct net *peer_net; int bearer_id; int rc; if (in_own_node(net, dnode)) { tipc_loopback_trace(net, list); spin_lock_init(&list->lock); tipc_sk_rcv(net, list); return 0; } n = tipc_node_find(net, dnode); if (unlikely(!n)) { __skb_queue_purge(list); return -EHOSTUNREACH; } rcu_read_lock(); tipc_node_read_lock(n); node_up = node_is_up(n); peer_net = n->peer_net; tipc_node_read_unlock(n); if (node_up && peer_net && check_net(peer_net)) { /* xmit inner linux container */ tipc_lxc_xmit(peer_net, list); if (likely(skb_queue_empty(list))) { rcu_read_unlock(); tipc_node_put(n); return 0; } } rcu_read_unlock(); tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); tipc_node_put(n); __skb_queue_purge(list); return -EHOSTUNREACH; } __skb_queue_head_init(&xmitq); le = &n->links[bearer_id]; spin_lock_bh(&le->lock); rc = tipc_link_xmit(le->link, list, &xmitq); spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); return rc; } /* tipc_node_xmit_skb(): send single buffer to destination * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; } /* tipc_node_distr_xmit(): send single buffer msgs to individual destinations * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected */ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) { struct sk_buff *skb; u32 selector, dnode; while ((skb = __skb_dequeue(xmitq))) { selector = msg_origport(buf_msg(skb)); dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, selector); } return 0; } void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; u16 dummy; u32 dst; /* Use broadcast if all nodes support it */ if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { __skb_queue_head_init(&xmitq); __skb_queue_tail(&xmitq, skb); tipc_bcast_xmit(net, &xmitq, &dummy); return; } /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; if (in_own_node(net, dst)) continue; if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) break; msg_set_destnode(buf_msg(txskb), dst); tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); kfree_skb(skb); } static void tipc_node_mcast_rcv(struct tipc_node *n) { struct tipc_bclink_entry *be = &n->bc_entry; /* 'arrvq' is under inputq2's lock protection */ spin_lock_bh(&be->inputq2.lock); spin_lock_bh(&be->inputq1.lock); skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); spin_unlock_bh(&be->inputq1.lock); spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(n->net, &be->arrvq, &be->inputq2); } static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_link *ucl; int rc; rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); return; } if (!(rc & TIPC_LINK_SND_STATE)) return; /* If probe message, a STATE response will be sent anyway */ if (msg_probe(hdr)) return; /* Produce a STATE message carrying broadcast NACK */ tipc_node_read_lock(n); ucl = n->links[bearer_id].link; if (ucl) tipc_link_build_state_msg(ucl, xmitq); tipc_node_read_unlock(n); } /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @bearer_id: id of bearer message arrived on * * Invoked with no locks held. */ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id) { int rc; struct sk_buff_head xmitq; struct tipc_bclink_entry *be; struct tipc_link_entry *le; struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); u32 dnode = msg_destnode(hdr); struct tipc_node *n; __skb_queue_head_init(&xmitq); /* If NACK for other node, let rcv link for that node peek into it */ if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net))) n = tipc_node_find(net, dnode); else n = tipc_node_find(net, msg_prevnode(hdr)); if (!n) { kfree_skb(skb); return; } be = &n->bc_entry; le = &n->links[bearer_id]; rc = tipc_bcast_rcv(net, be->link, skb); /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_STATE) { tipc_node_read_lock(n); tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); } if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) tipc_node_reset_links(n); tipc_node_put(n); } /** * tipc_node_check_state - check and if necessary update node state * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet * @xmitq: queue for messages to be xmited on * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; int pb_id; if (trace_tipc_node_check_state_enabled()) { trace_tipc_skb_dump(skb, false, "skb for node state check"); trace_tipc_node_check_state(n, true, " "); } l = n->links[bearer_id].link; if (!l) return false; rcv_nxt = tipc_link_rcv_nxt(l); if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) return true; /* Find parallel link, if any */ for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) { if ((pb_id != bearer_id) && n->links[pb_id].link) { pl = n->links[pb_id].link; break; } } if (!tipc_link_validate_msg(l, hdr)) { trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!"); return false; } /* Check and update node accesibility if applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) return true; if (!msg_peer_link_is_up(hdr)) return true; tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); } if (state == SELF_DOWN_PEER_LEAVING) { if (msg_peer_node_is_up(hdr)) return false; tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); return true; } if (state == SELF_LEAVING_PEER_DOWN) return false; /* Ignore duplicate packets */ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && !tipc_link_is_reset(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } /* If parallel link was already down, and this happened before * the tunnel link came up, node failover was never started. * Ensure that a FAILOVER_MSG is sent to get peer out of * NODE_FAILINGOVER state, also this node must accept * TUNNEL_MSGs from peer. */ if (n->state != NODE_FAILINGOVER) tipc_node_link_failover(n, pl, l, xmitq); /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open parallel link when tunnel link reaches synch point */ if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { if (!more(rcv_nxt, n->sync_point)) return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); if (pl) tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); return true; } /* No syncing needed if only one link */ if (!pl || !tipc_link_is_up(pl)) return true; /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { if (n->capabilities & TIPC_TUNNEL_ENHANCED) syncpt = msg_syncpt(hdr); else syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } } /* Open tunnel link when parallel link reaches synch point */ if (n->state == NODE_SYNCHING) { if (tipc_link_is_synching(l)) { tnl = l; } else { tnl = pl; pl = l; } inputq_len = skb_queue_len(tipc_link_inputq(pl)); dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len; if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; } if (l == pl) return true; if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) return true; if (usr == LINK_PROTOCOL) return true; return false; } return true; } /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_link_entry *le; struct tipc_msg *hdr; struct tipc_node *n; int bearer_id = b->identity; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; #ifdef CONFIG_TIPC_CRYPTO struct tipc_ehdr *ehdr; /* Check if message must be decrypted first */ if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) goto rcv; ehdr = (struct tipc_ehdr *)skb->data; if (likely(ehdr->user != LINK_CONFIG)) { n = tipc_node_find(net, ntohl(ehdr->addr)); if (unlikely(!n)) goto discard; } else { n = tipc_node_find_by_id(net, ehdr->id); } skb_dst_force(skb); tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; rcv: #endif /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(&skb))) goto discard; __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { if (unlikely(usr == LINK_CONFIG)) return tipc_disc_rcv(net, skb, b); else return tipc_node_bc_rcv(net, skb, bearer_id); } /* Discard unicast link messages destined for another node */ if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) goto discard; /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) goto discard; le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) { if (unlikely(skb_linearize(skb))) { tipc_node_put(n); goto discard; } hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { spin_lock_bh(&le->lock); if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } spin_unlock_bh(&le->lock); } tipc_node_read_unlock(n); /* Check/update node state before receiving */ if (unlikely(skb)) { if (unlikely(skb_linearize(skb))) goto out_node_put; tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } } tipc_node_write_unlock(n); } if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); out_node_put: tipc_node_put(n); discard: kfree_skb(skb); } void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; struct sk_buff_head xmitq; struct tipc_link_entry *e; struct tipc_node *n; __skb_queue_head_init(&xmitq); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; if (e->link) { if (prop == TIPC_NLA_PROP_TOL) tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) tipc_link_set_mtu(e->link, b->mtu); /* Update MTU for node link entry */ e->mtu = tipc_link_mss(e->link); } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); } int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are * mutually exclusive cases */ if (attrs[TIPC_NLA_NET_ADDR]) { addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; } if (attrs[TIPC_NLA_NET_NODEID]) { if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); addr = hash128to32(node_id); } if (in_own_node(net, addr)) return -ENOTSUPP; spin_lock_bh(&tn->node_list_lock); peer = tipc_node_find(net, addr); if (!peer) { spin_unlock_bh(&tn->node_list_lock); return -ENXIO; } tipc_node_write_lock(peer); if (peer->state != SELF_DOWN_PEER_DOWN && peer->state != SELF_DOWN_PEER_LEAVING) { tipc_node_write_unlock(peer); err = -EBUSY; goto err_out; } tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); spin_unlock_bh(&tn->node_list_lock); return err; } int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); int done = cb->args[0]; int last_addr = cb->args[1]; struct tipc_node *node; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (last_addr) { node = tipc_node_find(net, last_addr); if (!node) { rcu_read_unlock(); /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the NLMSG_DONE message having * the NLM_F_DUMP_INTR flag set if the node state * changed while we released the lock. */ cb->prev_seq = 1; return -EPIPE; } tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { if (node->preliminary) continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; else continue; } tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; tipc_node_read_unlock(node); goto out; } tipc_node_read_unlock(node); } done = 1; out: cb->args[0] = done; cb->args[1] = last_addr; rcu_read_unlock(); return skb->len; } /* tipc_node_find_by_name - locate owner node of link by link's name * @net: the applicable net namespace * @name: pointer to link name string * @bearer_id: pointer to index in 'node->links' array where the link was found. * * Returns pointer to node owning the link, or 0 if no matching link is found. */ static struct tipc_node *tipc_node_find_by_name(struct net *net, const char *link_name, unsigned int *bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l; struct tipc_node *n; struct tipc_node *found_node = NULL; int i; *bearer_id = 0; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_read_lock(n); for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l && !strcmp(tipc_link_name(l), link_name)) { *bearer_id = i; found_node = n; break; } } tipc_node_read_unlock(n); if (found_node) break; } rcu_read_unlock(); return found_node; } int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) { int err; int res = 0; int bearer_id; char *name; struct tipc_link *link; struct tipc_node *node; struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); __skb_queue_head_init(&xmitq); if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); if (strcmp(name, tipc_bclink_name) == 0) return tipc_nl_bc_link_set(net, attrs); node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; } if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); tipc_link_set_queue_limits(link, tipc_link_min_win(link), max_win); } } out: tipc_node_read_unlock(node); tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, NULL); return res; } int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct tipc_nl_msg msg; char *name; int err; msg.portid = info->snd_portid; msg.seq = info->snd_seq; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) { err = -EINVAL; goto err_free; } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); err = -EINVAL; goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); if (err) goto err_free; } return genlmsg_reply(msg.skb, info); err_free: nlmsg_free(msg.skb); return err; } int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) { int err; char *link_name; unsigned int bearer_id; struct tipc_link *link; struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); err = -EINVAL; if (!strcmp(link_name, tipc_bclink_name)) { err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; } else if (strstr(link_name, tipc_bclink_name)) { rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); link = node->bc_entry.link; if (link && !strcmp(link_name, tipc_link_name(link))) { err = tipc_bclink_reset_stats(net, link); tipc_node_read_unlock(node); break; } tipc_node_read_unlock(node); } rcu_read_unlock(); return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); if (!node) return -EINVAL; le = &node->links[bearer_id]; tipc_node_read_lock(node); spin_lock_bh(&le->lock); link = node->links[bearer_id].link; if (!link) { spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return -EINVAL; } tipc_link_reset_stats(link); spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return 0; } /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, struct tipc_node *node, u32 *prev_link, bool bc_link) { u32 i; int err; for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; if (!node->links[i].link) continue; err = __tipc_nl_add_link(net, msg, node->links[i].link, NLM_F_MULTI); if (err) return err; } if (bc_link) { *prev_link = i; err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); if (err) return err; } *prev_link = 0; return 0; } int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; bool bc_link = cb->args[3]; int err; if (done) return 0; if (!prev_node) { /* Check if broadcast-receiver links dumping is needed */ if (attrs && attrs[TIPC_NLA_LINK]) { err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], tipc_nl_link_policy, NULL); if (unlikely(err)) return err; if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) return -EINVAL; bc_link = true; } } msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. */ cb->prev_seq = 1; goto out; } tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } else { err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } done = 1; out: rcu_read_unlock(); cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; cb->args[3] = bc_link; return skb->len; } int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; struct net *net = sock_net(skb->sk); int err; if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, info->extack); if (err) return err; if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); err = tipc_nl_monitor_set_threshold(net, val); if (err) return err; } return 0; } static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) { struct nlattr *attrs; void *hdr; u32 val; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; val = tipc_nl_monitor_get_threshold(net); if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_add_monitor_prop(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_bearer = cb->args[0]; struct tipc_nl_msg msg; int bearer_id; int err; if (prev_bearer == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } rtnl_unlock(); cb->args[0] = bearer_id; return skb->len; } int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_node = cb->args[1]; u32 bearer_id = cb->args[2]; int done = cb->args[0]; struct tipc_nl_msg msg; int err; if (!prev_node) { struct nlattr **attrs = genl_dumpit_info(cb)->info.attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, NULL); if (err) return err; if (!mon[TIPC_NLA_MON_REF]) return -EINVAL; bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); if (bearer_id >= MAX_BEARERS) return -EINVAL; } if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); if (!err) done = 1; rtnl_unlock(); cb->args[0] = done; cb->args[1] = prev_node; cb->args[2] = bearer_id; return skb->len; } #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, struct tipc_aead_key **pkey) { struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; struct tipc_aead_key *key; if (!attr) return -ENODATA; if (nla_len(attr) < sizeof(*key)) return -EINVAL; key = (struct tipc_aead_key *)nla_data(attr); if (key->keylen > TIPC_AEAD_KEYLEN_MAX || nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; *pkey = key; return 0; } static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) { struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; if (!attr) return -ENODATA; if (nla_len(attr) < TIPC_NODEID_LEN) return -EINVAL; *node_id = (u8 *)nla_data(attr); return 0; } static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) { struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; if (!attr) return -ENODATA; *intv = nla_get_u32(attr); return 0; } static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; bool rekeying = true, master_key = false; u8 *id, *own_id, mode; u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) return -EINVAL; rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) return rc; own_id = tipc_own_id(net); if (!own_id) { GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); return -EPERM; } rc = tipc_nl_retrieve_rekeying(attrs, &intv); if (rc == -ENODATA) rekeying = false; rc = tipc_nl_retrieve_key(attrs, &ukey); if (rc == -ENODATA && rekeying) goto rekeying; else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); if (rc) return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: mode = CLUSTER_KEY; master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); if (unlikely(!n)) return -ENOMEM; c = n->crypto_rx; } break; default: return rc; } /* Initiate the TX/RX key */ rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } else if (c == tx) { /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); rekeying: /* Schedule TX rekeying if needed */ tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_set_key(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_node *n; tipc_crypto_key_flush(tn->crypto_tx); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); return 0; } int tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_flush_key(skb, info); rtnl_unlock(); return err; } #endif /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped * @more: dump more? * - false: dump only tipc node data * - true: dump node link data as well * @buf: returned buffer of dump data in format */ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) { int i = 0; size_t sz = (more) ? NODE_LMAX : NODE_LMIN; if (!n) { i += scnprintf(buf, sz, "node data: (null)\n"); return i; } i += scnprintf(buf, sz, "node data: %x", n->addr); i += scnprintf(buf + i, sz - i, " %x", n->state); i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); i += scnprintf(buf + i, sz - i, " %x", n->action_flags); i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); i += scnprintf(buf + i, sz - i, " %u", n->sync_point); i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); i += scnprintf(buf + i, sz - i, " %u", n->working_links); i += scnprintf(buf + i, sz - i, " %x", n->capabilities); i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); if (!more) return i; i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[0].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[1].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "bclink:\n "); i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); return i; } void tipc_node_pre_cleanup_net(struct net *exit_net) { struct tipc_node *n; struct tipc_net *tn; struct net *tmp; rcu_read_lock(); for_each_net_rcu(tmp) { if (tmp == exit_net) continue; tn = tipc_net(tmp); if (!tn) continue; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_rcu(n, &tn->node_list, list) { if (!n->peer_net) continue; if (n->peer_net != exit_net) continue; tipc_node_write_lock(n); n->peer_net = NULL; n->peer_hash_mix = 0; tipc_node_write_unlock_fast(n); break; } spin_unlock_bh(&tn->node_list_lock); } rcu_read_unlock(); }
209 3607 2097 2287 21 1610 2308 4 401 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILELOCK_H #define _LINUX_FILELOCK_H #include <linux/fs.h> #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_RECLAIM 4096 /* reclaiming from a reboot server */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. */ #define FILE_LOCK_DEFERRED 1 struct file_lock; struct file_lease; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_lock_expirable)(struct file_lock *cfl); void (*lm_expire_lock)(void); }; struct lease_manager_operations { bool (*lm_break)(struct file_lease *); int (*lm_change)(struct file_lease *, int, struct list_head *); void (*lm_setup)(struct file_lease *, void **); bool (*lm_breaker_owns_lease)(struct file_lease *); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* * struct file_lock has a union that some filesystems use to track * their own private info. The NFS side of things is defined here: */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The varous i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock_core { struct file_lock_core *flc_blocker; /* The lock that is blocking us */ struct list_head flc_list; /* link into file_lock_context */ struct hlist_node flc_link; /* node in global lists */ struct list_head flc_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head flc_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t flc_owner; unsigned int flc_flags; unsigned char flc_type; pid_t flc_pid; int flc_link_cpu; /* what cpu's list is this on? */ wait_queue_head_t flc_wait; struct file *flc_file; }; struct file_lock { struct file_lock_core c; loff_t fl_start; loff_t fl_end; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; struct { struct inode *inode; } ceph; } fl_u; } __randomize_layout; struct file_lease { struct file_lock_core c; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */ } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; #ifdef CONFIG_FILE_LOCKING int fcntl_getlk(struct file *, unsigned int, struct flock *); int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); static inline bool lock_is_unlock(struct file_lock *fl) { return fl->c.flc_type == F_UNLCK; } static inline bool lock_is_read(struct file_lock *fl) { return fl->c.flc_type == F_RDLCK; } static inline bool lock_is_write(struct file_lock *fl) { return fl->c.flc_type == F_WRLCK; } static inline void locks_wake_up(struct file_lock *fl) { wake_up(&fl->c.flc_wait); } static inline bool locks_can_async_lock(const struct file_operations *fops) { return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; } /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); void locks_init_lock(struct file_lock *); struct file_lock *locks_alloc_lock(void); void locks_copy_lock(struct file_lock *, struct file_lock *); void locks_copy_conflock(struct file_lock *, struct file_lock *); void locks_remove_posix(struct file *, fl_owner_t); void locks_remove_file(struct file *); void locks_release_private(struct file_lock *); void posix_test_lock(struct file *, struct file_lock *); int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); int locks_delete_block(struct file_lock *); int vfs_test_lock(struct file *, struct file_lock *); int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); int vfs_cancel_lock(struct file *filp, struct file_lock *fl); bool vfs_inode_has_locks(struct inode *inode); int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); int vfs_setlease(struct file *, int, struct file_lease **, void **); int lease_modify(struct file_lease *, int, struct list_head *); struct notifier_block; int lease_register_notifier(struct notifier_block *); void lease_unregister_notifier(struct notifier_block *); struct files_struct; void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner); static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return smp_load_acquire(&inode->i_flctx); } #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } #if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 *user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline bool lock_is_unlock(struct file_lock *fl) { return false; } static inline bool lock_is_read(struct file_lock *fl) { return false; } static inline bool lock_is_write(struct file_lock *fl) { return false; } static inline void locks_wake_up(struct file_lock *fl) { } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_init_lease(struct file_lease *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline bool vfs_inode_has_locks(struct inode *inode) { return false; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } static inline int kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner) { return false; } static inline struct file_lock_context * locks_inode_context(const struct inode *inode) { return NULL; } #endif /* !CONFIG_FILE_LOCKING */ /* for walking lists of file_locks linked by fl_list */ #define for_each_file_lock(_fl, _head) list_for_each_entry(_fl, _head, c.flc_list) static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(file_inode(filp), fl); } #ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ flctx = READ_ONCE(inode->i_flctx); if (!flctx) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { int ret; ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); } return ret; } static inline int break_deleg_wait(struct inode **delegated_inode) { int ret; ret = break_deleg(*delegated_inode, O_WRONLY); iput(*delegated_inode); *delegated_inode = NULL; return ret; } static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, FL_LAYOUT); return 0; } #else /* !CONFIG_FILE_LOCKING */ static inline int break_lease(struct inode *inode, unsigned int mode) { return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { return 0; } static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) { return 0; } static inline int break_deleg_wait(struct inode **delegated_inode) { BUG(); return 0; } static inline int break_layout(struct inode *inode, bool wait) { return 0; } #endif /* CONFIG_FILE_LOCKING */ #endif /* _LINUX_FILELOCK_H */
81 145 2 9 2 1 40 85 85 85 108 42 42 42 103 24 36 9 62 42 83 67 23 44 64 64 86 176 107 176 110 42 103 21 171 4651 4665 4645 4623 120 4666 335 333 109 109 109 6 105 105 109 3 109 44 44 44 44 44 44 12 44 104 105 105 105 44 105 31 5 7 26 4 2 45 1 3 2 1 44 40 3 34 34 26 7 44 63 63 63 7 56 75 33 66 4 61 55 1 1 53 1 3 2 1 1 53 53 52 48 3 52 52 4 47 1 42 13 1 4 4 4 25 2 10 12 15 25 25 18 1 17 16 16 16 3 13 11 12 12 6 6 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 // SPDX-License-Identifier: GPL-2.0 /* * linux/mm/mlock.c * * (C) Copyright 1995 Linus Torvalds * (C) Copyright 2002 Christoph Hellwig */ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> #include <linux/sched/user.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/pagewalk.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/export.h> #include <linux/rmap.h> #include <linux/mmzone.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/secretmem.h> #include "internal.h" struct mlock_fbatch { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = { .lock = INIT_LOCAL_LOCK(lock), }; bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) return true; if (capable(CAP_IPC_LOCK)) return true; return false; } EXPORT_SYMBOL(can_do_mlock); /* * Mlocked folios are marked with the PG_mlocked flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate * statistics. * * An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it * will be ostensibly placed on the LRU "unevictable" list (actually no such * list exists), rather than the [in]active lists. PG_unevictable is set to * indicate the unevictable state. */ static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec) { /* There is nothing more we can do while it's off LRU */ if (!folio_test_clear_lru(folio)) return lruvec; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (unlikely(folio_evictable(folio))) { /* * This is a little surprising, but quite possible: PG_mlocked * must have got cleared already by another CPU. Could this * folio be unevictable? I'm not sure, but move it now if so. */ if (folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, folio_nr_pages(folio)); } goto out; } if (folio_test_unevictable(folio)) { if (folio_test_mlocked(folio)) folio->mlock_count++; goto out; } lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: folio_set_lru(folio); return lruvec; } static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); lruvec = folio_lruvec_relock_irq(folio, lruvec); /* As above, this is a little surprising, but possible */ if (unlikely(folio_evictable(folio))) goto out; folio_set_unevictable(folio); folio->mlock_count = !!folio_test_mlocked(folio); __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio)); out: lruvec_add_folio(lruvec, folio); folio_set_lru(folio); return lruvec; } static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec) { int nr_pages = folio_nr_pages(folio); bool isolated = false; if (!folio_test_clear_lru(folio)) goto munlock; isolated = true; lruvec = folio_lruvec_relock_irq(folio, lruvec); if (folio_test_unevictable(folio)) { /* Then mlock_count is maintained, but might undercount */ if (folio->mlock_count) folio->mlock_count--; if (folio->mlock_count) goto out; } /* else assume that was the last mlock: reclaim will fix it if not */ munlock: if (folio_test_clear_mlocked(folio)) { __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); if (isolated || !folio_test_unevictable(folio)) __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); else __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } /* folio_evictable() has to be checked *after* clearing Mlocked */ if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_unevictable(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } out: if (isolated) folio_set_lru(folio); return lruvec; } /* * Flags held in the low bits of a struct folio pointer on the mlock_fbatch. */ #define LRU_FOLIO 0x1 #define NEW_FOLIO 0x2 static inline struct folio *mlock_lru(struct folio *folio) { return (struct folio *)((unsigned long)folio + LRU_FOLIO); } static inline struct folio *mlock_new(struct folio *folio) { return (struct folio *)((unsigned long)folio + NEW_FOLIO); } /* * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can * make use of such folio pointer flags in future, but for now just keep it for * mlock. We could use three separate folio batches instead, but one feels * better (munlocking a full folio batch does not need to drain mlocking folio * batches first). */ static void mlock_folio_batch(struct folio_batch *fbatch) { struct lruvec *lruvec = NULL; unsigned long mlock; struct folio *folio; int i; for (i = 0; i < folio_batch_count(fbatch); i++) { folio = fbatch->folios[i]; mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO); folio = (struct folio *)((unsigned long)folio - mlock); fbatch->folios[i] = folio; if (mlock & LRU_FOLIO) lruvec = __mlock_folio(folio, lruvec); else if (mlock & NEW_FOLIO) lruvec = __mlock_new_folio(folio, lruvec); else lruvec = __munlock_folio(folio, lruvec); } if (lruvec) unlock_page_lruvec_irq(lruvec); folios_put(fbatch); } void mlock_drain_local(void) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } void mlock_drain_remote(int cpu) { struct folio_batch *fbatch; WARN_ON_ONCE(cpu_online(cpu)); fbatch = &per_cpu(mlock_fbatch.fbatch, cpu); if (folio_batch_count(fbatch)) mlock_folio_batch(fbatch); } bool need_mlock_drain(int cpu) { return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu)); } /** * mlock_folio - mlock a folio already on (or temporarily off) LRU * @folio: folio to be mlocked. */ void mlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); if (!folio_test_set_mlocked(folio)) { int nr_pages = folio_nr_pages(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * mlock_new_folio - mlock a newly allocated folio not yet on LRU * @folio: folio to be mlocked, either normal or a THP head. */ void mlock_new_folio(struct folio *folio) { struct folio_batch *fbatch; int nr_pages = folio_nr_pages(folio); local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); folio_set_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } /** * munlock_folio - munlock a folio * @folio: folio to be munlocked, either normal or a THP head. */ void munlock_folio(struct folio *folio) { struct folio_batch *fbatch; local_lock(&mlock_fbatch.lock); fbatch = this_cpu_ptr(&mlock_fbatch.fbatch); /* * folio_test_clear_mlocked(folio) must be left to __munlock_folio(), * which will check whether the folio is multiply mlocked. */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || folio_test_large(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; unsigned int count = (end - addr) >> PAGE_SHIFT; pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL, NULL, NULL); } static inline bool allow_mlock_munlock(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned int step) { /* * For unlock, allow munlock large folio which is partially * mapped to VMA. As it's possible that large folio is * mlocked and VMA is split later. * * During memory pressure, such kind of large folio can * be split. And the pages are not in VM_LOCKed VMA * can be reclaimed. */ if (!(vma->vm_flags & VM_LOCKED)) return true; /* folio_within_range() cannot take KSM, but any small folio is OK */ if (!folio_test_large(folio)) return true; /* folio not in range [start, end), skip mlock */ if (!folio_within_range(folio, vma, start, end)) return false; /* folio is not fully mapped, skip mlock */ if (step != folio_nr_pages(folio)) return false; return true; } static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; unsigned int step = 1; unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (!pmd_present(*pmd)) goto out; if (is_huge_zero_pmd(*pmd)) goto out; folio = pmd_folio(*pmd); if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); goto out; } start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (!start_pte) { walk->action = ACTION_AGAIN; return 0; } for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) continue; folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; step = folio_mlock_step(folio, pte, addr, end); if (!allow_mlock_munlock(folio, vma, start, end, step)) goto next_entry; if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); next_entry: pte += step - 1; addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: spin_unlock(ptl); cond_resched(); return 0; } /* * mlock_vma_pages_range() - mlock any pages already in the range, * or munlock all pages in the range. * @vma - vma containing range to be mlock()ed or munlock()ed * @start - start address in @vma of the range * @end - end of range in @vma * @newflags - the new set of flags for @vma. * * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. */ static void mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t newflags) { static const struct mm_walk_ops mlock_walk_ops = { .pmd_entry = mlock_pte_range, .walk_lock = PGWALK_WRLOCK_VERIFY, }; /* * There is a slight chance that concurrent page migration, * or page reclaim finding a page of this now-VM_LOCKED vma, * will call mlock_vma_folio() and raise page's mlock_count: * double counting, leaving the page unevictable indefinitely. * Communicate this danger to mlock_vma_folio() with VM_IO, * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. * mmap_lock is held in write mode here, so this weird * combination should not be visible to other mmap_lock users; * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. */ if (newflags & VM_LOCKED) newflags |= VM_IO; vma_start_write(vma); vm_flags_reset_once(vma, newflags); lru_add_drain(); walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); lru_add_drain(); if (newflags & VM_IO) { newflags &= ~VM_IO; vm_flags_reset_once(vma, newflags); } } /* * mlock_fixup - handle mlock[all]/munlock[all] requests. * * Filters out "special" vmas -- VM_LOCKED never gets set for these, and * munlock is a no-op. However, for some special vmas, we go ahead and * populate the ptes. * * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; if (newflags == oldflags || (oldflags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; } /* * Keep track of amount of locked VM. */ nr_pages = (end - start) >> PAGE_SHIFT; if (!(newflags & VM_LOCKED)) nr_pages = -nr_pages; else if (oldflags & VM_LOCKED) nr_pages = 0; mm->locked_vm += nr_pages; /* * vm_flags is protected by the mmap_lock held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ vma_start_write(vma); vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); } out: *prev = vma; return ret; } static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; vma = vma_iter_load(&vmi); if (!vma) return -ENOMEM; prev = vma_prev(&vmi); if (start > vma->vm_start) prev = vma; nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { int error; vm_flags_t newflags; if (vma->vm_start != tmp) return -ENOMEM; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= flags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ tmp = vma->vm_end; if (tmp > end) tmp = end; error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) return error; tmp = vma_iter_end(&vmi); nstart = tmp; } if (tmp < end) return -ENOMEM; return 0; } /* * Go through vma areas and sum size of mlocked * vma pages, as return value. * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) * is also counted. * Return value: previously mlocked page counts */ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; unsigned long count = 0; unsigned long end; VMA_ITERATOR(vmi, mm, start); /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); if (end < vma->vm_end) { count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; } } return count >> PAGE_SHIFT; } /* * convert get_user_pages() return value to posix mlock() error */ static int __mlock_posix_error_return(long retval) { if (retval == -EFAULT) retval = -ENOMEM; else if (retval == -ENOMEM) retval = -EAGAIN; return retval; } static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; int error = -ENOMEM; start = untagged_addr(start); if (!can_do_mlock()) return -EPERM; len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; locked += current->mm->locked_vm; if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { /* * It is possible that the regions requested intersect with * previously mlocked areas, that part area in "mm->locked_vm" * should not be counted to new mlock increment count. So check * and adjust locked count if necessary. */ locked -= count_mm_mlocked_page_nr(current->mm, start, len); } /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = apply_vma_lock_flags(start, len, flags); mmap_write_unlock(current->mm); if (error) return error; error = __mm_populate(start, len, 0); if (error) return __mlock_posix_error_return(error); return 0; } SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { return do_mlock(start, len, VM_LOCKED); } SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) { vm_flags_t vm_flags = VM_LOCKED; if (flags & ~MLOCK_ONFAULT) return -EINVAL; if (flags & MLOCK_ONFAULT) vm_flags |= VM_LOCKONFAULT; return do_mlock(start, len, vm_flags); } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; start = untagged_addr(start); len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_vma_lock_flags(start, len, 0); mmap_write_unlock(current->mm); return ret; } /* * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) * and translate into the appropriate modifications to mm->def_flags and/or the * flags for all current VMAs. * * There are a couple of subtleties with this. If mlockall() is called multiple * times with different flags, the values do not necessarily stack. If mlockall * is called once including the MCL_FUTURE flag and then a second time without * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. */ static int apply_mlockall_flags(int flags) { VMA_ITERATOR(vmi, current->mm, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= ~VM_LOCKED_MASK; if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; if (flags & MCL_ONFAULT) current->mm->def_flags |= VM_LOCKONFAULT; if (!(flags & MCL_CURRENT)) goto out; } if (flags & MCL_CURRENT) { to_add |= VM_LOCKED; if (flags & MCL_ONFAULT) to_add |= VM_LOCKONFAULT; } for_each_vma(vmi, vma) { int error; vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, newflags); /* Ignore errors, but prev needs fixing up. */ if (error) prev = vma; cond_resched(); } out: return 0; } SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret; if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || flags == MCL_ONFAULT) return -EINVAL; if (!can_do_mlock()) return -EPERM; lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); mmap_write_unlock(current->mm); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); return ret; } SYSCALL_DEFINE0(munlockall) { int ret; if (mmap_write_lock_killable(current->mm)) return -EINTR; ret = apply_mlockall_flags(0); mmap_write_unlock(current->mm); return ret; } /* * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB * shm segments) get accounted against the user_struct instead. */ static DEFINE_SPINLOCK(shmlock_user_lock); int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit != RLIM_INFINITY) lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; } if (!get_ucounts(ucounts)) { dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); allowed = 0; goto out; } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); put_ucounts(ucounts); }
2288 2291 2293 2293 2285 2294 2290 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 // SPDX-License-Identifier: GPL-2.0 /* * SHA1 routine optimized to do word accesses rather than byte accesses, * and to avoid unnecessary copies into the context array. * * This was based on the git SHA1 implementation. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/string.h> #include <crypto/sha1.h> #include <linux/unaligned.h> /* * If you have 32 registers or more, the compiler can (and should) * try to change the array[] accesses into registers. However, on * machines with less than ~25 registers, that won't really work, * and at least gcc will make an unholy mess of it. * * So to avoid that mess which just slows things down, we force * the stores to memory to actually happen (we might be better off * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as * suggested by Artur Skawina - that will also make gcc unable to * try to do the silly "optimize away loads" part because it won't * see what the value will be). * * Ben Herrenschmidt reports that on PPC, the C version comes close * to the optimized asm with this (ie on PPC you don't want that * 'volatile', since there are lots of registers). * * On ARM we get the best code generation by forcing a full memory barrier * between each SHA_ROUND, otherwise gcc happily get wild with spilling and * the stack frame size simply explode and performance goes down the drain. */ #ifdef CONFIG_X86 #define setW(x, val) (*(volatile __u32 *)&W(x) = (val)) #elif defined(CONFIG_ARM) #define setW(x, val) do { W(x) = (val); __asm__("":::"memory"); } while (0) #else #define setW(x, val) (W(x) = (val)) #endif /* This "rolls" over the 512-bit array */ #define W(x) (array[(x)&15]) /* * Where do we get the source from? The first 16 iterations get it from * the input data, the next mix it from the 512-bit array. */ #define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t) #define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1) #define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \ __u32 TEMP = input(t); setW(t, TEMP); \ E += TEMP + rol32(A,5) + (fn) + (constant); \ B = ror32(B, 2); \ TEMP = E; E = D; D = C; C = B; B = A; A = TEMP; } while (0) #define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E ) #define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E ) #define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E ) #define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E ) /** * sha1_transform - single block SHA1 transform (deprecated) * * @digest: 160 bit digest to update * @data: 512 bits of data to hash * @array: 16 words of workspace (see note) * * This function executes SHA-1's internal compression function. It updates the * 160-bit internal state (@digest) with a single 512-bit data block (@data). * * Don't use this function. SHA-1 is no longer considered secure. And even if * you do have to use SHA-1, this isn't the correct way to hash something with * SHA-1 as this doesn't handle padding and finalization. * * Note: If the hash is security sensitive, the caller should be sure * to clear the workspace. This is left to the caller to avoid * unnecessary clears between chained hashing operations. */ void sha1_transform(__u32 *digest, const char *data, __u32 *array) { __u32 A, B, C, D, E; unsigned int i = 0; A = digest[0]; B = digest[1]; C = digest[2]; D = digest[3]; E = digest[4]; /* Round 1 - iterations 0-16 take their input from 'data' */ for (; i < 16; ++i) T_0_15(i, A, B, C, D, E); /* Round 1 - tail. Input from 512-bit mixing array */ for (; i < 20; ++i) T_16_19(i, A, B, C, D, E); /* Round 2 */ for (; i < 40; ++i) T_20_39(i, A, B, C, D, E); /* Round 3 */ for (; i < 60; ++i) T_40_59(i, A, B, C, D, E); /* Round 4 */ for (; i < 80; ++i) T_60_79(i, A, B, C, D, E); digest[0] += A; digest[1] += B; digest[2] += C; digest[3] += D; digest[4] += E; } EXPORT_SYMBOL(sha1_transform); /** * sha1_init - initialize the vectors for a SHA1 digest * @buf: vector to initialize */ void sha1_init(__u32 *buf) { buf[0] = 0x67452301; buf[1] = 0xefcdab89; buf[2] = 0x98badcfe; buf[3] = 0x10325476; buf[4] = 0xc3d2e1f0; } EXPORT_SYMBOL(sha1_init); MODULE_DESCRIPTION("SHA-1 Algorithm"); MODULE_LICENSE("GPL");
86 84 93 6 6 6 6 6 6 2 2 2 2 2 93 94 93 94 93 94 94 2 76 80 71 70 43 71 70 71 42 44 38 37 34 22 22 22 38 38 14 10 10 13 13 6 6 19 13 2 4 8 8 8 1 1 1 1 1 1 1 4 1 3 3 4 4 4 4 1 1 1 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 /* * * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * * Cgroup v2 * Copyright (C) 2019 Red Hat, Inc. * Author: Giuseppe Scrivano <gscrivan@redhat.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * */ #include <linux/cgroup.h> #include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) /* Use t->m[0] to encode the offset */ #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static struct cftype *dfl_files; static struct cftype *legacy_files; static inline struct page_counter * __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, bool rsvd) { if (rsvd) return &h_cg->rsvd_hugepage[idx]; return &h_cg->hugepage[idx]; } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) { return (h_cg == root_h_cgroup); } static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { struct hstate *h; for_each_hstate(h) { if (page_counter_read( hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) return true; } return false; } static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, struct hugetlb_cgroup *parent_h_cgroup) { int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { struct page_counter *fault_parent = NULL; struct page_counter *rsvd_parent = NULL; unsigned long limit; int ret; if (parent_h_cgroup) { fault_parent = hugetlb_cgroup_counter_from_cgroup( parent_h_cgroup, idx); rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( parent_h_cgroup, idx); } page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), fault_parent, false); page_counter_init( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), rsvd_parent, false); limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), limit); VM_BUG_ON(ret); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), limit); VM_BUG_ON(ret); } } static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) { int node; for_each_node(node) kfree(h_cgroup->nodeinfo[node]); kfree(h_cgroup); } static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; int node; h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; /* * TODO: this routine can waste much memory for nodes which will * never be onlined. It's better to use memory hotplug callback * function. */ for_each_node(node) { /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ int node_to_alloc = node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; h_cgroup->nodeinfo[node] = kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), GFP_KERNEL, node_to_alloc); if (!h_cgroup->nodeinfo[node]) goto fail_alloc_nodeinfo; } hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; fail_alloc_nodeinfo: hugetlb_cgroup_free(h_cgroup); return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* * Should be called with hugetlb_lock held. * Since we are holding hugetlb_lock, pages cannot get moved from * active list or uncharged from the cgroup, So no need to get * page reference and test for page active here. This function * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ if (!hcg || hcg != h_cg) goto out; nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(folio, parent); out: return; } /* * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); } static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, enum hugetlb_memory_event event) { atomic_long_inc(&hugetlb->events_local[idx][event]); cgroup_file_notify(&hugetlb->events_local_file[idx]); do { atomic_long_inc(&hugetlb->events[idx][event]); cgroup_file_notify(&hugetlb->events_file[idx]); } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && !hugetlb_cgroup_is_root(hugetlb)); } static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr, bool rsvd) { int ret = 0; struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; if (hugetlb_cgroup_disabled()) goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); if (!page_counter_try_charge( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages, &counter)) { ret = -ENOMEM; hugetlb_event(h_cg, idx, HUGETLB_MAX); css_put(&h_cg->css); goto done; } /* Reservations take a reference to the css because they do not get * reparented. */ if (!rsvd) css_put(&h_cg->css); done: *ptr = h_cg; return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); } int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); } /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; lockdep_assert_held(&hugetlb_lock); __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* * Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); else { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); } void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); } void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || !resv->css) return; page_counter_uncharge(resv->reservation_counter, (end - start) * resv->pages_per_hpage); css_put(resv->css); } void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; if (rg->reservation_counter && resv->pages_per_hpage && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); /* * Only do css_put(rg->css) when we delete the entire region * because one file_region must hold exactly one css reference. */ if (region_del) css_put(rg->css); } } enum { RES_USAGE, RES_RSVD_USAGE, RES_LIMIT, RES_RSVD_LIMIT, RES_MAX_USAGE, RES_RSVD_MAX_USAGE, RES_FAILCNT, RES_RSVD_FAILCNT, }; static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) { int nid; struct cftype *cft = seq_cft(seq); int idx = MEMFILE_IDX(cft->private); bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); struct cgroup_subsys_state *css; unsigned long usage; if (legacy) { /* Add up usage across all nodes for the non-hierarchical total. */ usage = 0; for_each_node_state(nid, N_MEMORY) usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); seq_printf(seq, "total=%lu", usage * PAGE_SIZE); /* Simply print the per-node usage for the non-hierarchical total. */ for_each_node_state(nid, N_MEMORY) seq_printf(seq, " N%d=%lu", nid, READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * PAGE_SIZE); seq_putc(seq, '\n'); } /* * The hierarchical total is pretty much the value recorded by the * counter, so use that. */ seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); /* * For each node, transverse the css tree to obtain the hierarchical * node usage. */ for_each_node_state(nid, N_MEMORY) { usage = 0; rcu_read_lock(); css_for_each_descendant_pre(css, &h_cg->css) { usage += READ_ONCE(hugetlb_cgroup_from_css(css) ->nodeinfo[nid] ->usage[idx]); } rcu_read_unlock(); seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); } seq_putc(seq, '\n'); return 0; } static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct page_counter *counter; struct page_counter *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_RSVD_USAGE: return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; case RES_RSVD_LIMIT: return (u64)rsvd_counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_RSVD_MAX_USAGE: return (u64)rsvd_counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; case RES_RSVD_FAILCNT: return rsvd_counter->failcnt; default: BUG(); } } static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) { int idx; u64 val; struct cftype *cft = seq_cft(seq); unsigned long limit; struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) seq_puts(seq, "max\n"); else seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; default: BUG(); } return 0; } static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, const char *max) { int ret, idx; unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); bool rsvd = false; if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ return -EINVAL; buf = strstrip(buf); ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; idx = MEMFILE_IDX(of_cft(of)->private); nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); } static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); } static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret = 0; struct page_counter *counter, *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: page_counter_reset_watermark(counter); break; case RES_RSVD_MAX_USAGE: page_counter_reset_watermark(rsvd_counter); break; case RES_FAILCNT: counter->failcnt = 0; break; case RES_RSVD_FAILCNT: rsvd_counter->failcnt = 0; break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) { if (hsize >= SZ_1G) snprintf(buf, size, "%luGB", hsize / SZ_1G); else if (hsize >= SZ_1M) snprintf(buf, size, "%luMB", hsize / SZ_1M); else snprintf(buf, size, "%luKB", hsize / SZ_1K); return buf; } static int __hugetlb_events_show(struct seq_file *seq, bool local) { int idx; long max; struct cftype *cft = seq_cft(seq); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); if (local) max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); else max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); seq_printf(seq, "max %lu\n", max); return 0; } static int hugetlb_events_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, false); } static int hugetlb_events_local_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, true); } static struct cftype hugetlb_dfl_tmpl[] = { { .name = "max", .private = RES_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.max", .private = RES_RSVD_LIMIT, .seq_show = hugetlb_cgroup_read_u64_max, .write = hugetlb_cgroup_write_dfl, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "current", .private = RES_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "rsvd.current", .private = RES_RSVD_USAGE, .seq_show = hugetlb_cgroup_read_u64_max, .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events", .seq_show = hugetlb_events_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "events.local", .seq_show = hugetlb_events_local_show, .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), .flags = CFTYPE_NOT_ON_ROOT, }, { .name = "numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, .flags = CFTYPE_NOT_ON_ROOT, }, /* don't need terminator here */ }; static struct cftype hugetlb_legacy_tmpl[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "rsvd.limit_in_bytes", .private = RES_RSVD_LIMIT, .read_u64 = hugetlb_cgroup_read_u64, .write = hugetlb_cgroup_write_legacy, }, { .name = "usage_in_bytes", .private = RES_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.usage_in_bytes", .private = RES_RSVD_USAGE, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = RES_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.max_usage_in_bytes", .private = RES_RSVD_MAX_USAGE, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "failcnt", .private = RES_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "rsvd.failcnt", .private = RES_RSVD_FAILCNT, .write = hugetlb_cgroup_reset, .read_u64 = hugetlb_cgroup_read_u64, }, { .name = "numa_stat", .seq_show = hugetlb_cgroup_read_numa_stat, }, /* don't need terminator here */ }; static void __init hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, struct cftype *tmpl, int tmpl_size) { char buf[32]; int i, idx = hstate_index(h); /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { *cft = *tmpl; /* rebuild the name */ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); /* rebuild the private */ cft->private = MEMFILE_PRIVATE(idx, tmpl->private); /* rebuild the file_offset */ if (tmpl->file_offset) { unsigned int offset = tmpl->file_offset; cft->file_offset = MEMFILE_OFFSET0(offset) + MEMFILE_FIELD_SIZE(offset) * idx; } lockdep_register_key(&cft->lockdep_key); } } static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, hugetlb_dfl_tmpl, DFL_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) { int idx = hstate_index(h); hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); } static void __init __hugetlb_cgroup_file_init(struct hstate *h) { __hugetlb_cgroup_file_dfl_init(h); __hugetlb_cgroup_file_legacy_init(h); } static void __init __hugetlb_cgroup_file_pre_init(void) { int cft_count; cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); BUG_ON(!dfl_files); cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); BUG_ON(!legacy_files); } static void __init __hugetlb_cgroup_file_post_init(void) { WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, dfl_files)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, legacy_files)); } void __init hugetlb_cgroup_file_init(void) { struct hstate *h; __hugetlb_cgroup_file_pre_init(); for_each_hstate(h) __hugetlb_cgroup_file_init(h); __hugetlb_cgroup_file_post_init(); } /* * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = folio_hstate(old_folio); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_folio(old_folio); h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); set_hugetlb_cgroup(old_folio, NULL); set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(new_folio, h_cg); set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); } static struct cftype hugetlb_files[] = { {} /* terminate */ }; struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, .dfl_cftypes = hugetlb_files, .legacy_cftypes = hugetlb_files, };
13 13 86 9 80 100 101 101 80 194 145 146 98 95 67 101 142 74 207 187 187 54 58 4 54 15 95 95 95 6 36 95 8 36 94 95 90 37 90 95 95 62 36 34 2 36 35 36 7 13 16 36 4 12 14 9 14 93 22 95 5 90 14 105 4 43 95 95 95 95 95 22 104 46 67 67 22 67 67 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 // SPDX-License-Identifier: GPL-2.0-only /* * reservations.c * * Allocation reservations implementation * * Some code borrowed from fs/ext3/balloc.c and is: * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * The rest is copyright (C) 2010 Novell. All rights reserved. */ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> #include <linux/bitops.h> #include <linux/list.h> #include <cluster/masklog.h> #include "ocfs2.h" #include "ocfs2_trace.h" #ifdef CONFIG_OCFS2_DEBUG_FS #define OCFS2_CHECK_RESERVATIONS #endif static DEFINE_SPINLOCK(resv_lock); int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) { return (osb->osb_resv_level && osb->osb_dir_resv_level); } static unsigned int ocfs2_resv_window_bits(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv) { struct ocfs2_super *osb = resmap->m_osb; unsigned int bits; if (!(resv->r_flags & OCFS2_RESV_FLAG_DIR)) { /* 8, 16, 32, 64, 128, 256, 512, 1024 */ bits = 4 << osb->osb_resv_level; } else { bits = 4 << osb->osb_dir_resv_level; } return bits; } static inline unsigned int ocfs2_resv_end(struct ocfs2_alloc_reservation *resv) { if (resv->r_len) return resv->r_start + resv->r_len - 1; return resv->r_start; } static inline int ocfs2_resv_empty(struct ocfs2_alloc_reservation *resv) { return !!(resv->r_len == 0); } static inline int ocfs2_resmap_disabled(struct ocfs2_reservation_map *resmap) { if (resmap->m_osb->osb_resv_level == 0) return 1; return 0; } static void ocfs2_dump_resv(struct ocfs2_reservation_map *resmap) { struct ocfs2_super *osb = resmap->m_osb; struct rb_node *node; struct ocfs2_alloc_reservation *resv; int i = 0; mlog(ML_NOTICE, "Dumping resmap for device %s. Bitmap length: %u\n", osb->dev_str, resmap->m_bitmap_len); node = rb_first(&resmap->m_reservations); while (node) { resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); mlog(ML_NOTICE, "start: %u\tend: %u\tlen: %u\tlast_start: %u" "\tlast_len: %u\n", resv->r_start, ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, resv->r_last_len); node = rb_next(node); i++; } mlog(ML_NOTICE, "%d reservations found. LRU follows\n", i); i = 0; list_for_each_entry(resv, &resmap->m_lru, r_lru) { mlog(ML_NOTICE, "LRU(%d) start: %u\tend: %u\tlen: %u\t" "last_start: %u\tlast_len: %u\n", i, resv->r_start, ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, resv->r_last_len); i++; } } #ifdef OCFS2_CHECK_RESERVATIONS static int ocfs2_validate_resmap_bits(struct ocfs2_reservation_map *resmap, int i, struct ocfs2_alloc_reservation *resv) { char *disk_bitmap = resmap->m_disk_bitmap; unsigned int start = resv->r_start; unsigned int end = ocfs2_resv_end(resv); while (start <= end) { if (ocfs2_test_bit(start, disk_bitmap)) { mlog(ML_ERROR, "reservation %d covers an allocated area " "starting at bit %u!\n", i, start); return 1; } start++; } return 0; } static void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) { unsigned int off = 0; int i = 0; struct rb_node *node; struct ocfs2_alloc_reservation *resv; node = rb_first(&resmap->m_reservations); while (node) { resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); if (i > 0 && resv->r_start <= off) { mlog(ML_ERROR, "reservation %d has bad start off!\n", i); goto bad; } if (resv->r_len == 0) { mlog(ML_ERROR, "reservation %d has no length!\n", i); goto bad; } if (resv->r_start > ocfs2_resv_end(resv)) { mlog(ML_ERROR, "reservation %d has invalid range!\n", i); goto bad; } if (ocfs2_resv_end(resv) >= resmap->m_bitmap_len) { mlog(ML_ERROR, "reservation %d extends past bitmap!\n", i); goto bad; } if (ocfs2_validate_resmap_bits(resmap, i, resv)) goto bad; off = ocfs2_resv_end(resv); node = rb_next(node); i++; } return; bad: ocfs2_dump_resv(resmap); BUG(); } #else static inline void ocfs2_check_resmap(struct ocfs2_reservation_map *resmap) { } #endif void ocfs2_resv_init_once(struct ocfs2_alloc_reservation *resv) { memset(resv, 0, sizeof(*resv)); INIT_LIST_HEAD(&resv->r_lru); } void ocfs2_resv_set_type(struct ocfs2_alloc_reservation *resv, unsigned int flags) { BUG_ON(flags & ~OCFS2_RESV_TYPES); resv->r_flags |= flags; } void ocfs2_resmap_init(struct ocfs2_super *osb, struct ocfs2_reservation_map *resmap) { memset(resmap, 0, sizeof(*resmap)); resmap->m_osb = osb; resmap->m_reservations = RB_ROOT; /* m_bitmap_len is initialized to zero by the above memset. */ INIT_LIST_HEAD(&resmap->m_lru); } static void ocfs2_resv_mark_lru(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv) { assert_spin_locked(&resv_lock); if (!list_empty(&resv->r_lru)) list_del_init(&resv->r_lru); list_add_tail(&resv->r_lru, &resmap->m_lru); } static void __ocfs2_resv_trunc(struct ocfs2_alloc_reservation *resv) { resv->r_len = 0; resv->r_start = 0; } static void ocfs2_resv_remove(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv) { if (resv->r_flags & OCFS2_RESV_FLAG_INUSE) { list_del_init(&resv->r_lru); rb_erase(&resv->r_node, &resmap->m_reservations); resv->r_flags &= ~OCFS2_RESV_FLAG_INUSE; } } static void __ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv) { assert_spin_locked(&resv_lock); __ocfs2_resv_trunc(resv); /* * last_len and last_start no longer make sense if * we're changing the range of our allocations. */ resv->r_last_len = resv->r_last_start = 0; ocfs2_resv_remove(resmap, resv); } /* does nothing if 'resv' is null */ void ocfs2_resv_discard(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv) { if (resv) { spin_lock(&resv_lock); __ocfs2_resv_discard(resmap, resv); spin_unlock(&resv_lock); } } static void ocfs2_resmap_clear_all_resv(struct ocfs2_reservation_map *resmap) { struct rb_node *node; struct ocfs2_alloc_reservation *resv; assert_spin_locked(&resv_lock); while ((node = rb_last(&resmap->m_reservations)) != NULL) { resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); __ocfs2_resv_discard(resmap, resv); } } void ocfs2_resmap_restart(struct ocfs2_reservation_map *resmap, unsigned int clen, char *disk_bitmap) { if (ocfs2_resmap_disabled(resmap)) return; spin_lock(&resv_lock); ocfs2_resmap_clear_all_resv(resmap); resmap->m_bitmap_len = clen; resmap->m_disk_bitmap = disk_bitmap; spin_unlock(&resv_lock); } void ocfs2_resmap_uninit(struct ocfs2_reservation_map *resmap) { /* Does nothing for now. Keep this around for API symmetry */ } static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *new) { struct rb_root *root = &resmap->m_reservations; struct rb_node *parent = NULL; struct rb_node **p = &root->rb_node; struct ocfs2_alloc_reservation *tmp; assert_spin_locked(&resv_lock); trace_ocfs2_resv_insert(new->r_start, new->r_len); while (*p) { parent = *p; tmp = rb_entry(parent, struct ocfs2_alloc_reservation, r_node); if (new->r_start < tmp->r_start) { p = &(*p)->rb_left; /* * This is a good place to check for * overlapping reservations. */ BUG_ON(ocfs2_resv_end(new) >= tmp->r_start); } else if (new->r_start > ocfs2_resv_end(tmp)) { p = &(*p)->rb_right; } else { /* This should never happen! */ mlog(ML_ERROR, "Duplicate reservation window!\n"); BUG(); } } rb_link_node(&new->r_node, parent, p); rb_insert_color(&new->r_node, root); new->r_flags |= OCFS2_RESV_FLAG_INUSE; ocfs2_resv_mark_lru(resmap, new); ocfs2_check_resmap(resmap); } /** * ocfs2_find_resv_lhs() - find the window which contains goal * @resmap: reservation map to search * @goal: which bit to search for * * If a window containing that goal is not found, we return the window * which comes before goal. Returns NULL on empty rbtree or no window * before goal. */ static struct ocfs2_alloc_reservation * ocfs2_find_resv_lhs(struct ocfs2_reservation_map *resmap, unsigned int goal) { struct ocfs2_alloc_reservation *resv = NULL; struct ocfs2_alloc_reservation *prev_resv = NULL; struct rb_node *node = resmap->m_reservations.rb_node; assert_spin_locked(&resv_lock); if (!node) return NULL; node = rb_first(&resmap->m_reservations); while (node) { resv = rb_entry(node, struct ocfs2_alloc_reservation, r_node); if (resv->r_start <= goal && ocfs2_resv_end(resv) >= goal) break; /* Check if we overshot the reservation just before goal? */ if (resv->r_start > goal) { resv = prev_resv; break; } prev_resv = resv; node = rb_next(node); } return resv; } /* * We are given a range within the bitmap, which corresponds to a gap * inside the reservations tree (search_start, search_len). The range * can be anything from the whole bitmap, to a gap between * reservations. * * The start value of *rstart is insignificant. * * This function searches the bitmap range starting at search_start * with length search_len for a set of contiguous free bits. We try * to find up to 'wanted' bits, but can sometimes return less. * * Returns the length of allocation, 0 if no free bits are found. * * *cstart and *clen will also be populated with the result. */ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, unsigned int wanted, unsigned int search_start, unsigned int search_len, unsigned int *rstart, unsigned int *rlen) { void *bitmap = resmap->m_disk_bitmap; unsigned int best_start, best_len = 0; int offset, start, found; trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len, wanted, resmap->m_bitmap_len); found = best_start = best_len = 0; start = search_start; while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, start)) < resmap->m_bitmap_len) { /* Search reached end of the region */ if (offset >= (search_start + search_len)) break; if (offset == start) { /* we found a zero */ found++; /* move start to the next bit to test */ start++; } else { /* got a zero after some ones */ found = 1; start = offset + 1; } if (found > best_len) { best_len = found; best_start = start - found; } if (found >= wanted) break; } if (best_len == 0) return 0; if (best_len >= wanted) best_len = wanted; *rlen = best_len; *rstart = best_start; trace_ocfs2_resmap_find_free_bits_end(best_start, best_len); return *rlen; } static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, unsigned int goal, unsigned int wanted) { struct rb_root *root = &resmap->m_reservations; unsigned int gap_start, gap_end, gap_len; struct ocfs2_alloc_reservation *prev_resv, *next_resv; struct rb_node *prev, *next; unsigned int cstart, clen; unsigned int best_start = 0, best_len = 0; /* * Nasty cases to consider: * * - rbtree is empty * - our window should be first in all reservations * - our window should be last in all reservations * - need to make sure we don't go past end of bitmap */ trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv), goal, wanted, RB_EMPTY_ROOT(root)); assert_spin_locked(&resv_lock); if (RB_EMPTY_ROOT(root)) { /* * Easiest case - empty tree. We can just take * whatever window of free bits we want. */ clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, resmap->m_bitmap_len - goal, &cstart, &clen); /* * This should never happen - the local alloc window * will always have free bits when we're called. */ BUG_ON(goal == 0 && clen == 0); if (clen == 0) return; resv->r_start = cstart; resv->r_len = clen; ocfs2_resv_insert(resmap, resv); return; } prev_resv = ocfs2_find_resv_lhs(resmap, goal); if (prev_resv == NULL) { /* * A NULL here means that the search code couldn't * find a window that starts before goal. * * However, we can take the first window after goal, * which is also by definition, the leftmost window in * the entire tree. If we can find free bits in the * gap between goal and the LHS window, then the * reservation can safely be placed there. * * Otherwise we fall back to a linear search, checking * the gaps in between windows for a place to * allocate. */ next = rb_first(root); next_resv = rb_entry(next, struct ocfs2_alloc_reservation, r_node); /* * The search should never return such a window. (see * comment above */ if (next_resv->r_start <= goal) { mlog(ML_ERROR, "goal: %u next_resv: start %u len %u\n", goal, next_resv->r_start, next_resv->r_len); ocfs2_dump_resv(resmap); BUG(); } clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, next_resv->r_start - goal, &cstart, &clen); if (clen) { best_len = clen; best_start = cstart; if (best_len == wanted) goto out_insert; } prev_resv = next_resv; next_resv = NULL; } trace_ocfs2_resv_find_window_prev(prev_resv->r_start, ocfs2_resv_end(prev_resv)); prev = &prev_resv->r_node; /* Now we do a linear search for a window, starting at 'prev_rsv' */ while (1) { next = rb_next(prev); if (next) { next_resv = rb_entry(next, struct ocfs2_alloc_reservation, r_node); gap_start = ocfs2_resv_end(prev_resv) + 1; gap_end = next_resv->r_start - 1; gap_len = gap_end - gap_start + 1; } else { /* * We're at the rightmost edge of the * tree. See if a reservation between this * window and the end of the bitmap will work. */ gap_start = ocfs2_resv_end(prev_resv) + 1; gap_len = resmap->m_bitmap_len - gap_start; gap_end = resmap->m_bitmap_len - 1; } trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1, next ? ocfs2_resv_end(next_resv) : -1); /* * No need to check this gap if we have already found * a larger region of free bits. */ if (gap_len <= best_len) goto next_resv; clen = ocfs2_resmap_find_free_bits(resmap, wanted, gap_start, gap_len, &cstart, &clen); if (clen == wanted) { best_len = clen; best_start = cstart; goto out_insert; } else if (clen > best_len) { best_len = clen; best_start = cstart; } next_resv: if (!next) break; prev = next; prev_resv = rb_entry(prev, struct ocfs2_alloc_reservation, r_node); } out_insert: if (best_len) { resv->r_start = best_start; resv->r_len = best_len; ocfs2_resv_insert(resmap, resv); } } static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, unsigned int wanted) { struct ocfs2_alloc_reservation *lru_resv; int tmpwindow = !!(resv->r_flags & OCFS2_RESV_FLAG_TMP); unsigned int min_bits; if (!tmpwindow) min_bits = ocfs2_resv_window_bits(resmap, resv) >> 1; else min_bits = wanted; /* We at know the temp window will use all * of these bits */ /* * Take the first reservation off the LRU as our 'target'. We * don't try to be smart about it. There might be a case for * searching based on size but I don't have enough data to be * sure. --Mark (3/16/2010) */ lru_resv = list_first_entry(&resmap->m_lru, struct ocfs2_alloc_reservation, r_lru); trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start, lru_resv->r_len, ocfs2_resv_end(lru_resv)); /* * Cannibalize (some or all) of the target reservation and * feed it to the current window. */ if (lru_resv->r_len <= min_bits) { /* * Discard completely if size is less than or equal to a * reasonable threshold - 50% of window bits for non temporary * windows. */ resv->r_start = lru_resv->r_start; resv->r_len = lru_resv->r_len; __ocfs2_resv_discard(resmap, lru_resv); } else { unsigned int shrink; if (tmpwindow) shrink = min_bits; else shrink = lru_resv->r_len / 2; lru_resv->r_len -= shrink; resv->r_start = ocfs2_resv_end(lru_resv) + 1; resv->r_len = shrink; } trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, resv->r_last_len); ocfs2_resv_insert(resmap, resv); } static void ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, unsigned int wanted) { unsigned int goal = 0; BUG_ON(!ocfs2_resv_empty(resv)); /* * Begin by trying to get a window as close to the previous * one as possible. Using the most recent allocation as a * start goal makes sense. */ if (resv->r_last_len) { goal = resv->r_last_start + resv->r_last_len; if (goal >= resmap->m_bitmap_len) goal = 0; } __ocfs2_resv_find_window(resmap, resv, goal, wanted); /* Search from last alloc didn't work, try once more from beginning. */ if (ocfs2_resv_empty(resv) && goal != 0) __ocfs2_resv_find_window(resmap, resv, 0, wanted); if (ocfs2_resv_empty(resv)) { /* * Still empty? Pull oldest one off the LRU, remove it from * tree, put this one in it's place. */ ocfs2_cannibalize_resv(resmap, resv, wanted); } BUG_ON(ocfs2_resv_empty(resv)); } int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, int *cstart, int *clen) { if (resv == NULL || ocfs2_resmap_disabled(resmap)) return -ENOSPC; spin_lock(&resv_lock); if (ocfs2_resv_empty(resv)) { /* * We don't want to over-allocate for temporary * windows. Otherwise, we run the risk of fragmenting the * allocation space. */ unsigned int wanted = ocfs2_resv_window_bits(resmap, resv); if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) wanted = *clen; /* * Try to get a window here. If it works, we must fall * through and test the bitmap . This avoids some * ping-ponging of windows due to non-reserved space * being allocation before we initialize a window for * that inode. */ ocfs2_resv_find_window(resmap, resv, wanted); trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len); } BUG_ON(ocfs2_resv_empty(resv)); *cstart = resv->r_start; *clen = resv->r_len; spin_unlock(&resv_lock); return 0; } static void ocfs2_adjust_resv_from_alloc(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, unsigned int start, unsigned int end) { unsigned int rhs = 0; unsigned int old_end = ocfs2_resv_end(resv); BUG_ON(start != resv->r_start || old_end < end); /* * Completely used? We can remove it then. */ if (old_end == end) { __ocfs2_resv_discard(resmap, resv); return; } rhs = old_end - end; /* * This should have been trapped above. */ BUG_ON(rhs == 0); resv->r_start = end + 1; resv->r_len = old_end - resv->r_start + 1; } void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap, struct ocfs2_alloc_reservation *resv, u32 cstart, u32 clen) { unsigned int cend = cstart + clen - 1; if (resmap == NULL || ocfs2_resmap_disabled(resmap)) return; if (resv == NULL) return; BUG_ON(cstart != resv->r_start); spin_lock(&resv_lock); trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, resv->r_last_len); BUG_ON(cstart < resv->r_start); BUG_ON(cstart > ocfs2_resv_end(resv)); BUG_ON(cend > ocfs2_resv_end(resv)); ocfs2_adjust_resv_from_alloc(resmap, resv, cstart, cend); resv->r_last_start = cstart; resv->r_last_len = clen; /* * May have been discarded above from * ocfs2_adjust_resv_from_alloc(). */ if (!ocfs2_resv_empty(resv)) ocfs2_resv_mark_lru(resmap, resv); trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv), resv->r_len, resv->r_last_start, resv->r_last_len); ocfs2_check_resmap(resmap); spin_unlock(&resv_lock); }
133 128 93 1 197 1 133 194 126 95 270 39 5 217 9 1 38 2 2 164 1 1 220 1 23 23 14 9 345 1 2 348 2 19 3 6 282 283 3 43 27 32 32 1 31 11 12 9 3 12 1 8 4 19 1 22 2 45 26 20 38 43 47 47 47 47 47 47 47 47 20 47 42 41 42 41 1 36 6 31 142 143 1 1 2 139 115 1 26 37 51 114 140 283 281 284 284 281 1664 1651 227 388 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_queues.h> #include <net/sock.h> #include <linux/ethtool_netlink.h> #include <linux/phy_link_topology.h> #include <linux/pm_runtime.h> #include "netlink.h" #include "module_fw.h" static struct genl_family ethtool_genl_family; static bool ethnl_ok __read_mostly; static u32 ethnl_bcast_seq; #define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS | \ ETHTOOL_FLAG_OMIT_REPLY) #define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS) const struct nla_policy ethnl_header_policy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), }; const struct nla_policy ethnl_header_policy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), }; const struct nla_policy ethnl_header_policy_phy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; const struct nla_policy ethnl_header_policy_phy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type) { struct ethnl_sock_priv *sk_priv; sk_priv = genl_sk_priv_get(&ethtool_genl_family, NETLINK_CB(skb).sk); if (IS_ERR(sk_priv)) return PTR_ERR(sk_priv); sk_priv->dev = dev; sk_priv->portid = portid; sk_priv->type = type; return 0; } static void ethnl_sock_priv_destroy(void *priv) { struct ethnl_sock_priv *sk_priv = priv; switch (sk_priv->type) { case ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH: ethnl_module_fw_flash_sock_destroy(sk_priv); break; default: break; } } int ethnl_ops_begin(struct net_device *dev) { int ret; if (!dev) return -ENODEV; if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); if (!netif_device_present(dev) || dev->reg_state >= NETREG_UNREGISTERING) { ret = -ENODEV; goto err; } if (dev->ethtool_ops->begin) { ret = dev->ethtool_ops->begin(dev); if (ret) goto err; } return 0; err: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); return ret; } void ethnl_ops_complete(struct net_device *dev) { if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (dev->dev.parent) pm_runtime_put(dev->dev.parent); } /** * ethnl_parse_header_dev_get() - parse request header * @req_info: structure to put results into * @header: nest attribute with request header * @net: request netns * @extack: netlink extack for error reporting * @require_dev: fail if no device identified in header * * Parse request header in nested attribute @nest and puts results into * the structure pointed to by @req_info. Extack from @info is used for error * reporting. If req_info->dev is not null on return, reference to it has * been taken. If error is returned, *req_info is null initialized and no * reference is held. * * Return: 0 on success or negative error code */ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *header, struct net *net, struct netlink_ext_ack *extack, bool require_dev) { struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)]; const struct nlattr *devname_attr; struct net_device *dev = NULL; u32 flags = 0; int ret; if (!header) { if (!require_dev) return 0; NL_SET_ERR_MSG(extack, "request header missing"); return -EINVAL; } /* No validation here, command policy should have a nested policy set * for the header, therefore validation should have already been done. */ ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header, NULL, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_HEADER_FLAGS]) flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_DEV_INDEX], "no device matches ifindex"); return -ENODEV; } /* if both ifindex and ifname are passed, they must match */ if (devname_attr && strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { netdev_put(dev, &req_info->dev_tracker); NL_SET_ERR_MSG_ATTR(extack, header, "ifindex and name do not match"); return -ENODEV; } } else if (devname_attr) { dev = netdev_get_by_name(net, nla_data(devname_attr), &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, devname_attr, "no device matches name"); return -ENODEV; } } else if (require_dev) { NL_SET_ERR_MSG_ATTR(extack, header, "neither ifindex nor name specified"); return -EINVAL; } if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { if (dev) { req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]); } else { NL_SET_ERR_MSG_ATTR(extack, header, "phy_index set without a netdev"); return -EINVAL; } } req_info->dev = dev; req_info->flags = flags; return 0; } struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, const struct nlattr *header, struct netlink_ext_ack *extack) { struct phy_device *phydev; ASSERT_RTNL(); if (!req_info->dev) return NULL; if (!req_info->phy_index) return req_info->dev->phydev; phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index); if (!phydev) { NL_SET_ERR_MSG_ATTR(extack, header, "no phy matching phyindex"); return ERR_PTR(-ENODEV); } return phydev; } /** * ethnl_fill_reply_header() - Put common header into a reply message * @skb: skb with the message * @dev: network device to describe in header * @attrtype: attribute type to use for the nest * * Create a nested attribute with attributes describing given network device. * * Return: 0 on success, error value (-EMSGSIZE only) on error */ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype) { struct nlattr *nest; if (!dev) return 0; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) || nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) goto nla_put_failure; /* If more attributes are put into reply header, ethnl_header_size() * must be updated to account for them. */ nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } /** * ethnl_reply_init() - Create skb for a reply and fill device identification * @payload: payload length (without netlink and genetlink header) * @dev: device the reply is about (may be null) * @cmd: ETHTOOL_MSG_* message type for reply * @hdr_attrtype: attribute type for common header * @info: genetlink info of the received packet we respond to * @ehdrp: place to store payload pointer returned by genlmsg_new() * * Return: pointer to allocated skb on success, NULL on error */ struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp) { struct sk_buff *skb; skb = genlmsg_new(payload, GFP_KERNEL); if (!skb) goto err; *ehdrp = genlmsg_put_reply(skb, info, &ethtool_genl_family, 0, cmd); if (!*ehdrp) goto err_free; if (dev) { int ret; ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype); if (ret < 0) goto err_free; } return skb; err_free: nlmsg_free(skb); err: if (info) GENL_SET_ERR_MSG(info, "failed to setup reply message"); return NULL; } void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) { return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ethtool_genl_family, 0, cmd); } void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, &ethtool_genl_family, 0, cmd); } void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd) { return genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd); } int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) { return genlmsg_multicast_netns(&ethtool_genl_family, dev_net(dev), skb, 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); } /* GET request helpers */ /** * struct ethnl_dump_ctx - context structure for generic dumpit() callback * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used * in ethnl_default_dumpit() and ethnl_default_done(). */ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; unsigned long pos_ifindex; }; static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKINFO_SET] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKMODES_SET] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = &ethnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = &ethnl_debug_request_ops, [ETHTOOL_MSG_DEBUG_SET] = &ethnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = &ethnl_wol_request_ops, [ETHTOOL_MSG_WOL_SET] = &ethnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = &ethnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_PRIVFLAGS_SET] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = &ethnl_rings_request_ops, [ETHTOOL_MSG_RINGS_SET] = &ethnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_GET] = &ethnl_channels_request_ops, [ETHTOOL_MSG_CHANNELS_SET] = &ethnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_COALESCE_SET] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops, [ETHTOOL_MSG_PAUSE_SET] = &ethnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops, [ETHTOOL_MSG_EEE_SET] = &ethnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_FEC_SET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = &ethnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = &ethnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = &ethnl_module_request_ops, [ETHTOOL_MSG_MODULE_SET] = &ethnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = &ethnl_pse_request_ops, [ETHTOOL_MSG_PSE_SET] = &ethnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = &ethnl_rss_request_ops, [ETHTOOL_MSG_PLCA_GET_CFG] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_SET_CFG] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_GET_STATUS] = &ethnl_plca_status_request_ops, [ETHTOOL_MSG_MM_GET] = &ethnl_mm_request_ops, [ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops, [ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops, [ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) { return (struct ethnl_dump_ctx *)cb->ctx; } /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into * @info: genl_info from the request * @request_ops: struct request_ops for request type * @require_dev: fail if no device identified in header * * Parse universal request header and call request specific ->parse_request() * callback (if defined) to parse the rest of the message. * * Return: 0 on success or negative error code */ static int ethnl_default_parse(struct ethnl_req_info *req_info, const struct genl_info *info, const struct ethnl_request_ops *request_ops, bool require_dev) { struct nlattr **tb = info->attrs; int ret; ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr], genl_info_net(info), info->extack, require_dev); if (ret < 0) return ret; if (request_ops->parse_request) { ret = request_ops->parse_request(req_info, tb, info->extack); if (ret < 0) return ret; } return 0; } /** * ethnl_init_reply_data() - Initialize reply data for GET request * @reply_data: pointer to embedded struct ethnl_reply_data * @ops: instance of struct ethnl_request_ops describing the layout * @dev: network device to initialize the reply for * * Fills the reply data part with zeros and sets the dev member. Must be called * before calling the ->fill_reply() callback (for each iteration when handling * dump requests). */ static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data, const struct ethnl_request_ops *ops, struct net_device *dev) { memset(reply_data, 0, ops->reply_data_size); reply_data->dev = dev; } /* default ->doit() handler for GET type requests */ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) { struct ethnl_reply_data *reply_data = NULL; struct ethnl_req_info *req_info = NULL; const u8 cmd = info->genlhdr->cmd; const struct ethnl_request_ops *ops; int hdr_len, reply_len; struct sk_buff *rskb; void *reply_payload; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return -ENOMEM; } ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do); if (ret < 0) goto err_dev; ethnl_init_reply_data(reply_data, ops, req_info->dev); rtnl_lock(); ret = ops->prepare_data(req_info, reply_data, info); rtnl_unlock(); if (ret < 0) goto err_cleanup; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(), req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); if (!rskb) goto err_cleanup; hdr_len = rskb->len; ret = ops->fill_reply(rskb, req_info, reply_data); if (ret < 0) goto err_msg; WARN_ONCE(rskb->len - hdr_len > reply_len, "ethnl cmd %d: calculated reply length %d, but consumed %d\n", cmd, reply_len, rskb->len - hdr_len); if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(rskb, reply_payload); netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return genlmsg_reply(rskb, info); err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); nlmsg_free(rskb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_dev: netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return ret; } static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, const struct ethnl_dump_ctx *ctx, const struct genl_info *info) { void *ehdr; int ret; ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ethtool_genl_family, NLM_F_MULTI, ctx->ops->reply_cmd); if (!ehdr) return -EMSGSIZE; ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); rtnl_unlock(); if (ret < 0) goto out; ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); if (ret < 0) goto out; ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data); out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); ctx->reply_data->dev = NULL; if (ret < 0) genlmsg_cancel(skb, ehdr); else genlmsg_end(skb, ehdr); return ret; } /* Default ->dumpit() handler for GET requests. */ static int ethnl_default_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { dev_hold(dev); rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rcu_read_lock(); dev_put(dev); if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) ret = skb->len; break; } ret = 0; } rcu_read_unlock(); return ret; } /* generic ->start() handler for GET requests */ static int ethnl_default_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genlmsghdr *ghdr; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); ghdr = nlmsg_data(cb->nlh); ops = ethnl_default_requests[ghdr->cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) return -EOPNOTSUPP; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { /* We ignore device specification in dump requests but as the * same parser as for non-dump (doit) requests is used, it * would take reference to the device if it finds one */ netdev_put(req_info->dev, &req_info->dev_tracker); req_info->dev = NULL; } if (ret < 0) goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } /* default ->done() handler for GET requests */ static int ethnl_default_done(struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); kfree(ctx->reply_data); kfree(ctx->req_info); return 0; } static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct ethnl_request_ops *ops; struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; struct net_device *dev; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr], genl_info_net(info), info->extack, true); if (ret < 0) return ret; if (ops->set_validate) { ret = ops->set_validate(&req_info, info); /* 0 means nothing to do */ if (ret <= 0) goto out_dev; } dev = req_info.dev; rtnl_lock(); dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); if (!dev->cfg_pending) { ret = -ENOMEM; goto out_tie_cfg; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out_free_cfg; ret = ops->set(&req_info, info); if (ret < 0) goto out_ops; swap(dev->cfg, dev->cfg_pending); if (!ret) goto out_ops; ethtool_notify(dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: ethnl_ops_complete(dev); out_free_cfg: kfree(dev->cfg_pending); out_tie_cfg: dev->cfg_pending = dev->cfg; rtnl_unlock(); out_dev: ethnl_parse_header_dev_put(&req_info); return ret; } static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_NTF] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_DEBUG_NTF] = &ethnl_debug_request_ops, [ETHTOOL_MSG_WOL_NTF] = &ethnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_NTF] = &ethnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_NTF] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_NTF] = &ethnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_NTF] = &ethnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = &ethnl_fec_request_ops, [ETHTOOL_MSG_MODULE_NTF] = &ethnl_module_request_ops, [ETHTOOL_MSG_PLCA_NTF] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_MM_NTF] = &ethnl_mm_request_ops, }; /* default notification handler */ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, const void *data) { struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genl_info info; struct sk_buff *skb; void *reply_payload; int reply_len; int ret; genl_info_init_ntf(&info, &ethtool_genl_family, cmd); if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || !ethnl_default_notify_ops[cmd], "unexpected notification type %u\n", cmd)) return; ops = ethnl_default_notify_ops[cmd]; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return; } req_info->dev = dev; req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; ethnl_init_reply_data(reply_data, ops, dev); ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) goto err_cleanup; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret + ethnl_reply_header_size(); skb = genlmsg_new(reply_len, GFP_KERNEL); if (!skb) goto err_cleanup; reply_payload = ethnl_bcastmsg_put(skb, cmd); if (!reply_payload) goto err_skb; ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr); if (ret < 0) goto err_msg; ret = ops->fill_reply(skb, req_info, reply_data); if (ret < 0) goto err_msg; if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(skb, reply_payload); kfree(reply_data); kfree(req_info); ethnl_multicast(skb, dev); return; err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); err_skb: nlmsg_free(skb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); kfree(reply_data); kfree(req_info); return; } /* notifications */ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, const void *data); static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) { if (unlikely(!ethnl_ok)) return; ASSERT_RTNL(); if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && ethnl_notify_handlers[cmd])) ethnl_notify_handlers[cmd](dev, cmd, data); else WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", cmd, netdev_name(dev)); } EXPORT_SYMBOL(ethtool_notify); static void ethnl_notify_features(struct netdev_notifier_info *info) { struct net_device *dev = netdev_notifier_info_to_dev(info); ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); } static int ethnl_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *info = ptr; struct netlink_ext_ack *extack; struct net_device *dev; dev = netdev_notifier_info_to_dev(info); extack = netdev_notifier_info_to_extack(info); switch (event) { case NETDEV_FEAT_CHANGE: ethnl_notify_features(ptr); break; case NETDEV_PRE_UP: if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Can't set port up while flashing module firmware"); return NOTIFY_BAD; } } return NOTIFY_DONE; } static struct notifier_block ethnl_netdev_notifier = { .notifier_call = ethnl_netdev_event, }; /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_STRSET_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_strset_get_policy, .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkinfo_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkmodes_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkmodes_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKSTATE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkstate_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_debug_get_policy, .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_debug_set_policy, .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_wol_get_policy, .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_wol_set_policy, .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_features_get_policy, .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_features, .policy = ethnl_features_set_policy, .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_privflags_get_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_privflags_set_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_rings_get_policy, .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_rings_set_policy, .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_channels_get_policy, .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_channels_set_policy, .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_coalesce_get_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_coalesce_set_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_pause_get_policy, .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pause_set_policy, .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_eee_get_policy, .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_eee_set_policy, .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSINFO_GET, .doit = ethnl_default_doit, .start = ethnl_tsinfo_start, .dumpit = ethnl_tsinfo_dumpit, .done = ethnl_tsinfo_done, .policy = ethnl_tsinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, .policy = ethnl_cable_test_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, .policy = ethnl_cable_test_tdr_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, .doit = ethnl_tunnel_info_doit, .start = ethnl_tunnel_info_start, .dumpit = ethnl_tunnel_info_dumpit, .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_fec_get_policy, .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_eeprom_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_STATS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_stats_get_policy, .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_phc_vclocks_get_policy, .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_pse_get_policy, .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RSS_GET, .doit = ethnl_default_doit, .start = ethnl_rss_dump_start, .dumpit = ethnl_rss_dumpit, .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_CFG, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_plca_get_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_SET_CFG, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_plca_set_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_plca_get_status_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_mm_get_policy, .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_mm_set_policy, .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_FW_FLASH_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_module_fw_flash, .policy = ethnl_module_fw_flash_act_policy, .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHY_GET, .doit = ethnl_phy_doit, .start = ethnl_phy_start, .dumpit = ethnl_phy_dumpit, .done = ethnl_phy_done, .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSCONFIG_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_tsconfig_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSCONFIG_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_tsconfig_set_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME }, }; static struct genl_family ethtool_genl_family __ro_after_init = { .name = ETHTOOL_GENL_NAME, .version = ETHTOOL_GENL_VERSION, .netnsok = true, .parallel_ops = true, .ops = ethtool_genl_ops, .n_ops = ARRAY_SIZE(ethtool_genl_ops), .resv_start_op = ETHTOOL_MSG_MODULE_GET + 1, .mcgrps = ethtool_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps), .sock_priv_size = sizeof(struct ethnl_sock_priv), .sock_priv_destroy = ethnl_sock_priv_destroy, }; /* module setup */ static int __init ethnl_init(void) { int ret; ret = genl_register_family(&ethtool_genl_family); if (WARN(ret < 0, "ethtool: genetlink family registration failed")) return ret; ethnl_ok = true; ret = register_netdevice_notifier(&ethnl_netdev_notifier); WARN(ret < 0, "ethtool: net device notifier registration failed"); return ret; } subsys_initcall(ethnl_init);
4 3485 653 3 5549 4081 4081 4 3 3 586 586 3 244 6 360 878 965 1 4 3 752 752 5 2 2 3151 1877 581 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"}, \ {I_LINKABLE, "I_LINKABLE"}, \ {I_WB_SWITCH, "I_WB_SWITCH"}, \ {I_OVL_INUSE, "I_OVL_INUSE"}, \ {I_CREATING, "I_CREATING"}, \ {I_DONTCACHE, "I_DONTCACHE"}, \ {I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \ {I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \ {I_LRU_ISOLATING, "I_LRU_ISOLATING"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_folio_template, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct folio *folio, struct bdi_writeback *wb), TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, bdi_setpoint) __field(unsigned long, bdi_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); __entry->bdi_dirty = bdi_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->bdi_setpoint, __entry->bdi_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
40 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BITOPS_H #define _ASM_X86_BITOPS_H /* * Copyright 1992, Linus Torvalds. * * Note: inlines with more than a single statement should be marked * __always_inline to avoid problems with older gcc's inlining heuristics. */ #ifndef _LINUX_BITOPS_H #error only <linux/bitops.h> can be included directly #endif #include <linux/compiler.h> #include <asm/alternative.h> #include <asm/rmwcc.h> #include <asm/barrier.h> #if BITS_PER_LONG == 32 # define _BITOPS_LONG_SHIFT 5 #elif BITS_PER_LONG == 64 # define _BITOPS_LONG_SHIFT 6 #else # error "Unexpected BITS_PER_LONG" #endif #define BIT_64(n) (U64_C(1) << (n)) /* * These have to be done with inline assembly: that way the bit-setting * is guaranteed to be atomic. All bit operations return 0 if the bit * was cleared before the operation and != 0 if it was not. * * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ #define RLONG_ADDR(x) "m" (*(volatile long *) (x)) #define WBYTE_ADDR(x) "+m" (*(volatile char *) (x)) #define ADDR RLONG_ADDR(addr) /* * We do the locked ops that don't return the old value as * a mask operation on a byte. */ #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr)) : "memory"); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch___set_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch_clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); arch_clear_bit(nr, addr); } static __always_inline void arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *addr) { bool negative; asm volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) : "iq" ((char)mask) : "memory"); return negative; } #define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) { arch___clear_bit(nr, addr); } static __always_inline void arch___change_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline bool arch_test_and_set_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } static __always_inline bool arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return arch_test_and_set_bit(nr, addr); } static __always_inline bool arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm(__ASM_SIZE(bts) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_clear_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } /* * Note: the operation is performed atomically with respect to * the local CPU, but not other CPUs. Portable code should not * rely on this behaviour. * KVM relies on this behaviour on x86 for modifying memory that is also * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ static __always_inline bool arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btr) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btc) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_change_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) { bool oldbit; asm volatile("testb %2,%1" CC_SET(nz) : CC_OUT(nz) (oldbit) : "m" (((unsigned char *)addr)[nr >> 3]), "i" (1 << (nr & 7)) :"memory"); return oldbit; } static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(bt) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) : variable_test_bit(nr, addr); } static __always_inline bool arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : variable_test_bit(nr, addr); } static __always_inline unsigned long variable__ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } /** * __ffs - find first set bit in word * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ #define __ffs(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) static __always_inline unsigned long variable_ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) : "r" (~word)); return word; } /** * ffz - find first zero bit in word * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ #define ffz(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(~word) : \ variable_ffz(word)) /* * __fls: find last set bit in word * @word: The word to search * * Undefined if no set bit exists, so code should check against 0 first. */ static __always_inline unsigned long __fls(unsigned long word) { if (__builtin_constant_p(word)) return BITS_PER_LONG - 1 - __builtin_clzl(word); asm("bsr %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } #undef ADDR #ifdef __KERNEL__ static __always_inline int variable_ffs(int x) { int r; #ifdef CONFIG_X86_64 /* * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ asm("bsfl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "r" (-1)); #else asm("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * ffs - find first set bit in word * @x: the word to search * * This is defined the same way as the libc and compiler builtin ffs * routines, therefore differs in spirit from the other bitops. * * ffs(value) returns 0 if value is 0 or the position of the first * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ #define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) /** * fls - find last set bit in word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffs, but returns the position of the most significant set bit. * * fls(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ static __always_inline int fls(unsigned int x) { int r; if (__builtin_constant_p(x)) return x ? 32 - __builtin_clz(x) : 0; #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ asm("bsrl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); #else asm("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #ifdef CONFIG_X86_64 static __always_inline int fls64(__u64 x) { int bitpos = -1; if (__builtin_constant_p(x)) return x ? 64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before. */ asm("bsrq %1,%q0" : "+r" (bitpos) : ASM_INPUT_RM (x)); return bitpos + 1; } #else #include <asm-generic/bitops/fls64.h> #endif #include <asm-generic/bitops/sched.h> #include <asm/arch_hweight.h> #include <asm-generic/bitops/const_hweight.h> #include <asm-generic/bitops/instrumented-atomic.h> #include <asm-generic/bitops/instrumented-non-atomic.h> #include <asm-generic/bitops/instrumented-lock.h> #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */
12 6 331 332 1 1 1 1 1 21 23 23 1 22 18 5 1 1 1 2 2 6 1 4 6 1 3 1 5 1 15 1 1 13 13 13 13 1 1 47 3 8 8 1 6 1 6 2 42 42 1 1 4 36 3 2 41 1 36 6 9 9 1 1 7 7 1 53 1 1 1 42 8 8 42 44 1 43 14 14 15 1 1 1 1 1 7 1 1 1 1 1 1 6 1 1 2 1 1 2 2 61 1 1 56 3 10 1 1 1 45 47 22 1 1 1 20 52 1 1 1 1 38 1 1 3 1 1 1 9 1 1 1 1 1 1 2 1 1 2 3 3 3 5 1 2 1 2 1 1 1 2 1 2 2 1 13 1 9 1 2 2 3 5 9 1 1 7 8 1 1 4 5 1 3 1 1 3 1 1 1 2 7 1 2 4 1 5 7 1 5 1 5 3 1 1 4 1 1 2 2 1 1 1 1 25 1 1 14 1 1 1 1 1 1 3 2 1 1 1 8 1 1 2 7 15 37 1 1 1 9 25 1 1 2 4 15 6 1 4 1 1 1 2 4 7 1 1 1 4 1 1 1 2 3 4 1 1 1 1 3 1 2 2 2 2 1 1 1 1 1 3 1 1 1 2 1 1 2 2 8 1 1 1 4 1 2 1 2 4 6 1 5 5 5 5 1 4 1 4 4 2 4 2 1 5 1 1 3 2 2 6 1 5 15 1 1 1 5 7 5 6 2 1 1 4 1 1 2 11 1 9 1 9 17 1 1 1 1 1 1 2 10 11 1 1 3 6 20 1 1 1 1 13 5 13 64 63 1 62 2 62 1 90 1 1 94 1 1 1 91 91 1 1 1 72 1 2 5 1 1 1 2 3 1 1 1 34 1 2 1 24 6 14 1 1 1 1 6 4 4 6 8 2 10 1 1 1 1 1 5 1 3 11 1 1 1 1 2 5 9 1 1 1 1 1 4 2 1 1 2 1 1 11 1 1 1 1 1 6 1 2 5 4 1 1 9 1 5 3 1 1 6 1 1 1 1 1 1 1 1 4 1 1 3 2 7 1 1 5 1 1 2 28 1 1 25 1 17 1 2 13 1 1 2 1 9 9 1 8 3 5 14 1 1 7 4 1 1 2 7 1 1 1 9 1 3 1 1 1 1 1 1 620 1 1 1 6 15 12 46 11 7 5 20 3 9 8 13 24 15 5 4 8 3 3 9 8 7 11 4 16 6 1 1 5 6 15 2 4 94 5 3 9 4 26 2 11 34 14 10 11 9 2 2 17 1 1 7 1 4 7 20 2 1 14 4 3 10 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/kernel.h> #include <linux/bio.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/security.h> #include <linux/xattr.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/uuid.h> #include <linux/btrfs.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/fsverity.h> #include <linux/sched/xacct.h> #include <linux/io_uring/cmd.h> #include "ctree.h" #include "disk-io.h" #include "export.h" #include "transaction.h" #include "btrfs_inode.h" #include "volumes.h" #include "locking.h" #include "backref.h" #include "send.h" #include "dev-replace.h" #include "props.h" #include "sysfs.h" #include "qgroup.h" #include "tree-log.h" #include "compression.h" #include "space-info.h" #include "block-group.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" #include "file.h" #include "scrub.h" #include "super.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI * structures are incorrect, as the timespec structure from userspace * is 4 bytes too small. We define these alternatives here to teach * the kernel about the 32-bit struct packing. */ struct btrfs_ioctl_timespec_32 { __u64 sec; __u32 nsec; } __attribute__ ((__packed__)); struct btrfs_ioctl_received_subvol_args_32 { char uuid[BTRFS_UUID_SIZE]; /* in */ __u64 stransid; /* in */ __u64 rtransid; /* out */ struct btrfs_ioctl_timespec_32 stime; /* in */ struct btrfs_ioctl_timespec_32 rtime; /* out */ __u64 flags; /* in */ __u64 reserved[16]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ struct btrfs_ioctl_received_subvol_args_32) #endif #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_send_args_32 { __s64 send_fd; /* in */ __u64 clone_sources_count; /* in */ compat_uptr_t clone_sources; /* in */ __u64 parent_root; /* in */ __u64 flags; /* in */ __u32 version; /* in */ __u8 reserved[28]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ struct btrfs_ioctl_send_args_32) struct btrfs_ioctl_encoded_io_args_32 { compat_uptr_t iov; compat_ulong_t iovcnt; __s64 offset; __u64 flags; __u64 len; __u64 unencoded_len; __u64 unencoded_offset; __u32 compression; __u32 encryption; __u8 reserved[64]; }; #define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \ struct btrfs_ioctl_encoded_io_args_32) #define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \ struct btrfs_ioctl_encoded_io_args_32) #endif /* Mask out flags that are inappropriate for the given type of inode. */ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; else if (S_ISREG(inode->i_mode)) return flags & ~FS_DIRSYNC_FL; else return flags & (FS_NODUMP_FL | FS_NOATIME_FL); } /* * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) { unsigned int iflags = 0; u32 flags = binode->flags; u32 ro_flags = binode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; if (flags & BTRFS_INODE_IMMUTABLE) iflags |= FS_IMMUTABLE_FL; if (flags & BTRFS_INODE_APPEND) iflags |= FS_APPEND_FL; if (flags & BTRFS_INODE_NODUMP) iflags |= FS_NODUMP_FL; if (flags & BTRFS_INODE_NOATIME) iflags |= FS_NOATIME_FL; if (flags & BTRFS_INODE_DIRSYNC) iflags |= FS_DIRSYNC_FL; if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; if (ro_flags & BTRFS_INODE_RO_VERITY) iflags |= FS_VERITY_FL; if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; else if (flags & BTRFS_INODE_COMPRESS) iflags |= FS_COMPR_FL; return iflags; } /* * Update inode->i_flags based on the btrfs internal flags. */ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) { struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; if (binode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; if (binode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; if (binode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; if (binode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; if (binode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; if (binode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; set_mask_bits(&inode->i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } /* * Check if @flags are a supported and valid set of FS_*_FL flags and that * the old and new flags are not conflicting */ static int check_fsflags(unsigned int old_flags, unsigned int flags) { if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NOATIME_FL | FS_NODUMP_FL | \ FS_SYNC_FL | FS_DIRSYNC_FL | \ FS_NOCOMP_FL | FS_COMPR_FL | FS_NOCOW_FL)) return -EOPNOTSUPP; /* COMPR and NOCOMP on new/old are valid */ if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) return -EINVAL; if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL)) return -EINVAL; /* NOCOW and compression options are mutually exclusive */ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL))) return -EINVAL; if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL))) return -EINVAL; return 0; } static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) return -EPERM; return 0; } int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args) { if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL) return -ENAMETOOLONG; return 0; } static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2) { if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL) return -ENAMETOOLONG; return 0; } /* * Set flags/xflags from the internal inode flags. The remaining items of * fsxattr are zeroed. */ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); return 0; } int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; u32 binode_flags; if (btrfs_root_readonly(root)) return -EROFS; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); old_fsflags = btrfs_inode_flags_to_fsflags(binode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; ret = check_fsflags_compatible(fs_info, fsflags); if (ret) return ret; binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) binode_flags |= BTRFS_INODE_SYNC; else binode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) binode_flags |= BTRFS_INODE_IMMUTABLE; else binode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) binode_flags |= BTRFS_INODE_APPEND; else binode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) binode_flags |= BTRFS_INODE_NODUMP; else binode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) binode_flags |= BTRFS_INODE_NOATIME; else binode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { /* 1 item for the inode */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); goto update_flags; } if (fsflags & FS_DIRSYNC_FL) binode_flags |= BTRFS_INODE_DIRSYNC; else binode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { if (S_ISREG(inode->i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ if (inode->i_size == 0) binode_flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } else { binode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ if (S_ISREG(inode->i_mode)) { if (inode->i_size == 0) binode_flags &= ~(BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM); } else { binode_flags &= ~BTRFS_INODE_NODATACOW; } } /* * The COMPRESS flag can only be changed by users, while the NOCOMPRESS * flag may be changed automatically if compression code won't make * things smaller. */ if (fsflags & FS_NOCOMP_FL) { binode_flags &= ~BTRFS_INODE_COMPRESS; binode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { if (IS_SWAPFILE(inode)) return -ETXTBSY; binode_flags |= BTRFS_INODE_COMPRESS; binode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* * 1 for inode item * 2 for properties */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) return PTR_ERR(trans); if (comp) { ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); return ret; } static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); } static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_device *device; struct fstrim_range range; u64 minlen = ULLONG_MAX; u64 num_devices = 0; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * btrfs_trim_block_group() depends on space cache, which is not * available in zoned filesystem. So, disallow fitrim on a zoned * filesystem for now. */ if (btrfs_is_zoned(fs_info)) return -EOPNOTSUPP; /* * If the fs is mounted with nologreplay, which requires it to be * mounted in RO mode as well, we can not allow discard on free space * inside block groups, because log trees refer to extents that are not * pinned in a block group's free space cache (pinning the extents is * precisely the first phase of replaying a log tree). */ if (btrfs_test_opt(fs_info, NOLOGREPLAY)) return -EROFS; rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { if (!device->bdev || !bdev_max_discard_sectors(device->bdev)) continue; num_devices++; minlen = min_t(u64, bdev_discard_granularity(device->bdev), minlen); } rcu_read_unlock(); if (!num_devices) return -EOPNOTSUPP; if (copy_from_user(&range, arg, sizeof(range))) return -EFAULT; /* * NOTE: Don't truncate the range using super->total_bytes. Bytenr of * block group is in the logical address space, which can be any * sectorsize aligned bytenr in the range [0, U64_MAX]. */ if (range.len < fs_info->sectorsize) return -EINVAL; range.minlen = max(range.minlen, minlen); ret = btrfs_trim_fs(fs_info, &range); if (copy_to_user(arg, &range, sizeof(range))) return -EFAULT; return ret; } /* * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. */ static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) { /* * 1 to add root block * 1 to add root item * 1 to add root ref * 1 to add root backref * 1 to add UUID item * 1 to add qgroup info * 1 to add qgroup limit * * Ideally the last two would only be accounted if qgroups are enabled, * but that can change between now and the time we would insert them. */ unsigned int num_items = 7; if (inherit) { /* 2 to add qgroup relations for each inherited qgroup */ num_items += 2 * inherit->num_qgroups; } return num_items; } static noinline int create_subvol(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec64 cur_time = current_time(dir); struct btrfs_new_inode_args new_inode_args = { .dir = dir, .dentry = dentry, .subvol = true, }; unsigned int trans_num_items; int ret; dev_t anon_dev; u64 objectid; u64 qgroup_reserved = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) return -ENOMEM; ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) goto out_root_item; /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; goto out_root_item; } ret = get_anon_bdev(&anon_dev); if (ret < 0) goto out_root_item; new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); if (!new_inode_args.inode) { ret = -ENOMEM; goto out_anon_dev; } ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) goto out_inode; trans_num_items += create_subvol_num_items(inherit); btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, trans_num_items, false); if (ret) goto out_new_inode_args; qgroup_reserved = block_rsv.qgroup_rsv_reserved; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release_rsv; } btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit); if (ret) goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto out; } btrfs_mark_buffer_dirty(trans, leaf); inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, fs_info->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_flags(root_item, 0); btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); btrfs_set_root_bytenr(root_item, leaf->start); btrfs_set_root_generation(root_item, trans->transid); btrfs_set_root_level(root_item, 0); btrfs_set_root_refs(root_item, 1); btrfs_set_root_used(root_item, leaf->len); btrfs_set_root_last_snapshot(root_item, 0); btrfs_set_root_generation_v2(root_item, btrfs_root_generation(root_item)); generate_random_guid(root_item->uuid); btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); root_item->ctime = root_item->otime; btrfs_set_root_ctransid(root_item, trans->transid); btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { int ret2; /* * Since we don't abort the transaction in this case, free the * tree block so that we don't leak space and leave the * filesystem in an inconsistent state (an extent item in the * extent tree with a backreference for a root that does not * exists). */ btrfs_tree_lock(leaf); btrfs_clear_buffer_dirty(trans, leaf); btrfs_tree_unlock(leaf); ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1); if (ret2 < 0) btrfs_abort_transaction(trans, ret2); free_extent_buffer(leaf); goto out; } free_extent_buffer(leaf); leaf = NULL; new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); goto out; } /* anon_dev is owned by new_root now. */ anon_dev = 0; BTRFS_I(new_inode_args.inode)->root = new_root; /* ... and new_root is owned by new_inode_args.inode now. */ ret = btrfs_record_root_in_trans(trans, new_root); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_record_new_subvolume(trans, BTRFS_I(dir)); d_instantiate_new(dentry, new_inode_args.inode); new_inode_args.inode = NULL; out: trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_end_transaction(trans); out_release_rsv: btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL); if (qgroup_reserved) btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: iput(new_inode_args.inode); out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); out_root_item: kfree(root_item); return ret; } static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, bool readonly, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; unsigned int trans_num_items; struct btrfs_trans_handle *trans; struct btrfs_block_rsv *block_rsv; u64 qgroup_reserved = 0; int ret; /* We do not support snapshotting right now. */ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_warn(fs_info, "extent tree v2 doesn't support snapshotting yet"); return -EOPNOTSUPP; } if (btrfs_root_refs(&root->root_item) == 0) return -ENOENT; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return -EINVAL; if (atomic_read(&root->nr_swapfiles)) { btrfs_warn(fs_info, "cannot snapshot subvolume with active swapfile"); return -ETXTBSY; } pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL); if (!pending_snapshot) return -ENOMEM; ret = get_anon_bdev(&pending_snapshot->anon_dev); if (ret < 0) goto free_pending; pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), GFP_KERNEL); pending_snapshot->path = btrfs_alloc_path(); if (!pending_snapshot->root_item || !pending_snapshot->path) { ret = -ENOMEM; goto free_pending; } block_rsv = &pending_snapshot->block_rsv; btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item */ trans_num_items = create_subvol_num_items(inherit) + 3; ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv, trans_num_items, false); if (ret) goto free_pending; qgroup_reserved = block_rsv->qgroup_rsv_reserved; pending_snapshot->dentry = dentry; pending_snapshot->root = root; pending_snapshot->readonly = readonly; pending_snapshot->dir = BTRFS_I(dir); pending_snapshot->inherit = inherit; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fail; } ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root); if (ret) { btrfs_end_transaction(trans); goto fail; } btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); qgroup_reserved = 0; trans->pending_snapshot = pending_snapshot; ret = btrfs_commit_transaction(trans); if (ret) goto fail; ret = pending_snapshot->error; if (ret) goto fail; ret = btrfs_orphan_cleanup(pending_snapshot->snap); if (ret) goto fail; inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; } d_instantiate(dentry, inode); ret = 0; pending_snapshot->anon_dev = 0; fail: /* Prevent double freeing of anon_dev */ if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL); if (qgroup_reserved) btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); kfree(pending_snapshot); return ret; } /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do anything with * links pointing to it. * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 9. We can't remove a root or mountpoint. * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int btrfs_may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, int isdir) { int error; if (d_really_is_negative(victim)) return -ENOENT; /* The @victim is not inside @dir. */ if (d_inode(victim->d_parent) != dir) return -EINVAL; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } /* * Create a new subvolume below @parent. This is largely modeled after * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ static noinline int btrfs_mksubvol(const struct path *parent, struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *snap_src, bool readonly, struct btrfs_qgroup_inherit *inherit) { struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct dentry *dentry; struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (error == -EINTR) return error; dentry = lookup_one(idmap, name, parent->dentry, namelen); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; error = btrfs_may_create(idmap, dir, dentry); if (error) goto out_dput; /* * even if this name doesn't exist, we may get hash collisions. * check for them now when we can safely fail */ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str); if (error) goto out_dput; down_read(&fs_info->subvol_sem); if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) goto out_up_read; if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else error = create_subvol(idmap, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); out_up_read: up_read(&fs_info->subvol_sem); out_dput: dput(dentry); out_unlock: btrfs_inode_unlock(BTRFS_I(dir), 0); return error; } static noinline int btrfs_mksnapshot(const struct path *parent, struct mnt_idmap *idmap, const char *name, int namelen, struct btrfs_root *root, bool readonly, struct btrfs_qgroup_inherit *inherit) { int ret; /* * Force new buffered writes to reserve space even when NOCOW is * possible. This is to avoid later writeback (running dealloc) to * fallback to COW mode and unexpectedly fail with ENOSPC. */ btrfs_drew_read_lock(&root->snapshot_lock); ret = btrfs_start_delalloc_snapshot(root, false); if (ret) goto out; /* * All previous writes have started writeback in NOCOW mode, so now * we force future writes to fallback to COW mode during snapshot * creation. */ atomic_inc(&root->snapshot_force_cow); btrfs_wait_ordered_extents(root, U64_MAX, NULL); ret = btrfs_mksubvol(parent, idmap, name, namelen, root, readonly, inherit); atomic_dec(&root->snapshot_force_cow); out: btrfs_drew_read_unlock(&root->snapshot_lock); return ret; } /* * Try to start exclusive operation @type or cancel it if it's running. * * Return: * 0 - normal mode, newly claimed op started * >0 - normal mode, something else is running, * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space * ECANCELED - cancel mode, successful cancel * ENOTCONN - cancel mode, operation not running anymore */ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type, bool cancel) { if (!cancel) { /* Start normal op */ if (!btrfs_exclop_start(fs_info, type)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; /* Exclusive operation is now claimed */ return 0; } /* Cancel running op */ if (btrfs_exclop_start_try_lock(fs_info, type)) { /* * This blocks any exclop finish from setting it to NONE, so we * request cancellation. Either it runs and we will wait for it, * or it has finished and no waiting will happen. */ atomic_inc(&fs_info->reloc_cancel_req); btrfs_exclop_start_unlock(fs_info); if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING, TASK_INTERRUPTIBLE); return -ECANCELED; } /* Something else is running or none */ return -ENOTCONN; } static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 new_size; u64 old_size; u64 devid = 1; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; /* * Read the arguments before checking exclusivity to be able to * distinguish regular resize and cancel */ vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); goto out_drop; } ret = btrfs_check_ioctl_vol_args_path(vol_args); if (ret < 0) goto out_free; sizestr = vol_args->name; cancel = (strcmp("cancel", sizestr) == 0); ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); if (ret) goto out_free; /* Exclusive operation is now claimed */ devstr = strchr(sizestr, ':'); if (devstr) { sizestr = devstr + 1; *devstr = '\0'; devstr = vol_args->name; ret = kstrtoull(devstr, 10, &devid); if (ret) goto out_finish; if (!devid) { ret = -EINVAL; goto out_finish; } btrfs_info(fs_info, "resizing devid %llu", devid); } args.devid = devid; device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; goto out_finish; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_info(fs_info, "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; goto out_finish; } if (!strcmp(sizestr, "max")) new_size = bdev_nr_bytes(device->bdev); else { if (sizestr[0] == '-') { mod = -1; sizestr++; } else if (sizestr[0] == '+') { mod = 1; sizestr++; } new_size = memparse(sizestr, &retptr); if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; goto out_finish; } } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -EPERM; goto out_finish; } old_size = btrfs_device_get_total_bytes(device); if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; goto out_finish; } new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { ret = -ERANGE; goto out_finish; } new_size = old_size + new_size; } if (new_size < SZ_256M) { ret = -EINVAL; goto out_finish; } if (new_size > bdev_nr_bytes(device->bdev)) { ret = -EFBIG; goto out_finish; } new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_finish; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans); } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } /* equal, nothing need to do */ if (ret == 0 && new_size != old_size) btrfs_info_in_rcu(fs_info, "resize device %s (devid %llu) from %llu to %llu", btrfs_dev_name(device), device->devid, old_size, new_size); out_finish: btrfs_exclop_finish(fs_info); out_free: kfree(vol_args); out_drop: mnt_drop_write_file(file); return ret; } static noinline int __btrfs_ioctl_snap_create(struct file *file, struct mnt_idmap *idmap, const char *name, unsigned long fd, int subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) { int namelen; int ret = 0; if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; ret = mnt_want_write_file(file); if (ret) goto out; namelen = strlen(name); if (strchr(name, '/')) { ret = -EINVAL; goto out_drop_write; } if (name[0] == '.' && (namelen == 1 || (name[1] == '.' && namelen == 2))) { ret = -EEXIST; goto out_drop_write; } if (subvol) { ret = btrfs_mksubvol(&file->f_path, idmap, name, namelen, NULL, readonly, inherit); } else { CLASS(fd, src)(fd); struct inode *src_inode; if (fd_empty(src)) { ret = -EINVAL; goto out_drop_write; } src_inode = file_inode(fd_file(src)); if (src_inode->i_sb != file_inode(file)->i_sb) { btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; } else if (!inode_owner_or_capable(idmap, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only */ ret = -EPERM; } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { /* * Snapshots must be made with the src_inode referring * to the subvolume inode, otherwise the permission * checking above is useless because we may have * permission on a lower directory but not the subvol * itself. */ ret = -EINVAL; } else { ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, BTRFS_I(src_inode)->root, readonly, inherit); } } out_drop_write: mnt_drop_write_file(file); out: return ret; } static noinline int btrfs_ioctl_snap_create(struct file *file, void __user *arg, int subvol) { struct btrfs_ioctl_vol_args *vol_args; int ret; if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); ret = btrfs_check_ioctl_vol_args_path(vol_args); if (ret < 0) goto out; ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, false, NULL); out: kfree(vol_args); return ret; } static noinline int btrfs_ioctl_snap_create_v2(struct file *file, void __user *arg, int subvol) { struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; if (!S_ISDIR(file_inode(file)->i_mode)) return -ENOTDIR; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); if (ret < 0) goto free_args; if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) { ret = -EOPNOTSUPP; goto free_args; } if (vol_args->flags & BTRFS_SUBVOL_RDONLY) readonly = true; if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); if (vol_args->size < sizeof(*inherit) || vol_args->size > PAGE_SIZE) { ret = -EINVAL; goto free_args; } inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); if (IS_ERR(inherit)) { ret = PTR_ERR(inherit); goto free_args; } ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size); if (ret < 0) goto free_inherit; } ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file), vol_args->name, vol_args->fd, subvol, readonly, inherit); if (ret) goto free_inherit; free_inherit: kfree(inherit); free_args: kfree(vol_args); return ret; } static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, void __user *arg) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); if (btrfs_root_readonly(root)) flags |= BTRFS_SUBVOL_RDONLY; up_read(&fs_info->subvol_sem); if (copy_to_user(arg, &flags, sizeof(flags))) ret = -EFAULT; return ret; } static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; u64 flags; int ret = 0; if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) return -EPERM; ret = mnt_want_write_file(file); if (ret) goto out; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out_drop_write; } if (copy_from_user(&flags, arg, sizeof(flags))) { ret = -EFAULT; goto out_drop_write; } if (flags & ~BTRFS_SUBVOL_RDONLY) { ret = -EOPNOTSUPP; goto out_drop_write; } down_write(&fs_info->subvol_sem); /* nothing to do */ if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) goto out_drop_sem; root_flags = btrfs_root_flags(&root->root_item); if (flags & BTRFS_SUBVOL_RDONLY) { btrfs_set_root_flags(&root->root_item, root_flags | BTRFS_ROOT_SUBVOL_RDONLY); } else { /* * Block RO -> RW transition if this subvolume is involved in * send */ spin_lock(&root->root_item_lock); if (root->send_in_progress == 0) { btrfs_set_root_flags(&root->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); spin_unlock(&root->root_item_lock); } else { spin_unlock(&root->root_item_lock); btrfs_warn(fs_info, "Attempt to set subvolume %llu read-write during send", btrfs_root_id(root)); ret = -EPERM; goto out_drop_sem; } } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_reset; } ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { btrfs_end_transaction(trans); goto out_reset; } ret = btrfs_commit_transaction(trans); out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); out_drop_sem: up_write(&fs_info->subvol_sem); out_drop_write: mnt_drop_write_file(file); out: return ret; } static noinline int key_in_sk(struct btrfs_key *key, struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; test.objectid = sk->min_objectid; test.type = sk->min_type; test.offset = sk->min_offset; ret = btrfs_comp_cpu_keys(key, &test); if (ret < 0) return 0; test.objectid = sk->max_objectid; test.type = sk->max_type; test.offset = sk->max_offset; ret = btrfs_comp_cpu_keys(key, &test); if (ret > 0) return 0; return 1; } static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) { u64 found_transid; struct extent_buffer *leaf; struct btrfs_ioctl_search_header sh; struct btrfs_key test; unsigned long item_off; unsigned long item_len; int nritems; int i; int slot; int ret = 0; leaf = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); if (btrfs_header_generation(leaf) > sk->max_transid) { i = nritems; goto advance_key; } found_transid = btrfs_header_generation(leaf); for (i = slot; i < nritems; i++) { item_off = btrfs_item_ptr_offset(leaf, i); item_len = btrfs_item_size(leaf, i); btrfs_item_key_to_cpu(leaf, key, i); if (!key_in_sk(key, sk)) continue; if (sizeof(sh) + item_len > *buf_size) { if (*num_found) { ret = 1; goto out; } /* * return one empty item back for v1, which does not * handle -EOVERFLOW */ *buf_size = sizeof(sh) + item_len; item_len = 0; ret = -EOVERFLOW; } if (sizeof(sh) + item_len + *sk_offset > *buf_size) { ret = 1; goto out; } sh.objectid = key->objectid; sh.offset = key->offset; sh.type = key->type; sh.len = item_len; sh.transid = found_transid; /* * Copy search result header. If we fault then loop again so we * can fault in the pages and -EFAULT there if there's a * problem. Otherwise we'll fault and then copy the buffer in * properly this next time through */ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { ret = 0; goto out; } *sk_offset += sizeof(sh); if (item_len) { char __user *up = ubuf + *sk_offset; /* * Copy the item, same behavior as above, but reset the * * sk_offset so we copy the full thing again. */ if (read_extent_buffer_to_user_nofault(leaf, up, item_off, item_len)) { ret = 0; *sk_offset -= sizeof(sh); goto out; } *sk_offset += item_len; } (*num_found)++; if (ret) /* -EOVERFLOW from above */ goto out; if (*num_found >= sk->nr_items) { ret = 1; goto out; } } advance_key: ret = 0; test.objectid = sk->max_objectid; test.type = sk->max_type; test.offset = sk->max_offset; if (btrfs_comp_cpu_keys(key, &test) >= 0) ret = 1; else if (key->offset < (u64)-1) key->offset++; else if (key->type < (u8)-1) { key->offset = 0; key->type++; } else if (key->objectid < (u64)-1) { key->offset = 0; key->type = 0; key->objectid++; } else ret = 1; out: /* * 0: all items from this leaf copied, continue with next * 1: * more items can be copied, but unused buffer is too small * * all items were found * Either way, it will stops the loop which iterates to the next * leaf * -EOVERFLOW: item was to large for buffer * -EFAULT: could not copy extent buffer back to userspace */ return ret; } static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = inode_to_fs_info(inode); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; int ret; int num_found = 0; unsigned long sk_offset = 0; if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) { *buf_size = sizeof(struct btrfs_ioctl_search_header); return -EOVERFLOW; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (sk->tree_id == 0) { /* search the root of the inode that was passed */ root = btrfs_grab_root(BTRFS_I(inode)->root); } else { root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); return PTR_ERR(root); } } key.objectid = sk->min_objectid; key.type = sk->min_type; key.offset = sk->min_offset; while (1) { ret = -EFAULT; /* * Ensure that the whole user buffer is faulted in at sub-page * granularity, otherwise the loop may live-lock. */ if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) ret = 0; goto err; } ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); if (ret) break; } if (ret > 0) ret = 0; err: sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } static noinline int btrfs_ioctl_tree_search(struct inode *inode, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; buf_size = sizeof(uargs->buf); ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a * search header with a len of zero, so reset ret. */ if (ret == -EOVERFLOW) ret = 0; if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk))) ret = -EFAULT; return ret; } static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; u64 buf_size; const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* copy search header and buffer size */ if (copy_from_user(&args, uarg, sizeof(args))) return -EFAULT; buf_size = args.buf_size; /* limit result size to 16MB */ if (buf_size > buf_limit) buf_size = buf_limit; ret = search_ioctl(inode, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; else if (ret == -EOVERFLOW && copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size))) ret = -EFAULT; return ret; } /* * Search INODE_REFs to identify path name of 'dirid' directory * in a 'tree_id' tree. and sets path name to 'name'. */ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, u64 tree_id, u64 dirid, char *name) { struct btrfs_root *root; struct btrfs_key key; char *ptr; int ret = -1; int slot; int len; int total_len = 0; struct btrfs_inode_ref *iref; struct extent_buffer *l; struct btrfs_path *path; if (dirid == BTRFS_FIRST_FREE_OBJECTID) { name[0]='\0'; return 0; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; root = btrfs_get_fs_root(info, tree_id, true); if (IS_ERR(root)) { ret = PTR_ERR(root); root = NULL; goto out; } key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out; else if (ret > 0) { ret = -ENOENT; goto out; } l = path->nodes[0]; slot = path->slots[0]; iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(l, iref); ptr -= len + 1; total_len += len + 1; if (ptr < name) { ret = -ENAMETOOLONG; goto out; } *(ptr + len) = '/'; read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len); if (key.offset == BTRFS_FIRST_FREE_OBJECTID) break; btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; } memmove(name, ptr, total_len); name[total_len] = '\0'; ret = 0; out: btrfs_put_root(root); btrfs_free_path(path); return ret; } static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct inode *inode, struct btrfs_ioctl_ino_lookup_user_args *args) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u64 upper_limit = btrfs_ino(BTRFS_I(inode)); u64 treeid = btrfs_root_id(BTRFS_I(inode)->root); u64 dirid = args->dirid; unsigned long item_off; unsigned long item_len; struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct btrfs_root *root = NULL; struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; struct inode *temp_inode; char *ptr; int slot; int len; int total_len = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * If the bottom subvolume does not exist directly under upper_limit, * construct the path in from the bottom up. */ if (dirid != upper_limit) { ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; else if (ret > 0) { ret = -ENOENT; goto out_put; } leaf = path->nodes[0]; slot = path->slots[0]; iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(leaf, iref); ptr -= len + 1; total_len += len + 1; if (ptr < args->path) { ret = -ENAMETOOLONG; goto out_put; } *(ptr + len) = '/'; read_extent_buffer(leaf, ptr, (unsigned long)(iref + 1), len); /* Check the read+exec permission of this directory */ ret = btrfs_previous_item(root, path, dirid, BTRFS_INODE_ITEM_KEY); if (ret < 0) { goto out_put; } else if (ret > 0) { ret = -ENOENT; goto out_put; } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key2, slot); if (key2.objectid != dirid) { ret = -ENOENT; goto out_put; } /* * We don't need the path anymore, so release it and * avoid deadlocks and lockdep warnings in case * btrfs_iget() needs to lookup the inode from its root * btree and lock the same leaf. */ btrfs_release_path(path); temp_inode = btrfs_iget(key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; } ret = inode_permission(idmap, temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { ret = -EACCES; goto out_put; } if (key.offset == upper_limit) break; if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) { ret = -EACCES; goto out_put; } key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; } memmove(args->path, ptr, total_len); args->path[total_len] = '\0'; btrfs_put_root(root); root = NULL; btrfs_release_path(path); } /* Get the bottom subvolume's name from ROOT_REF */ key.objectid = treeid; key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0) { ret = -ENOENT; goto out; } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); item_off = btrfs_item_ptr_offset(leaf, slot); item_len = btrfs_item_size(leaf, slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { ret = -EINVAL; goto out; } /* Copy subvolume's name */ item_off += sizeof(struct btrfs_root_ref); item_len -= sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, args->name, item_off, item_len); args->name[item_len] = 0; out_put: btrfs_put_root(root); out: btrfs_free_path(path); return ret; } static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_ino_lookup_args *args; int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); /* * Unprivileged query to obtain the containing subvolume root id. The * path is reset so it's consistent with btrfs_search_path_in_tree. */ if (args->treeid == 0) args->treeid = btrfs_root_id(root); if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { args->name[0] = 0; goto out; } if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out; } ret = btrfs_search_path_in_tree(root->fs_info, args->treeid, args->objectid, args->name); out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; kfree(args); return ret; } /* * Version of ino_lookup ioctl (unprivileged) * * The main differences from ino_lookup ioctl are: * * 1. Read + Exec permission will be checked using inode_permission() during * path construction. -EACCES will be returned in case of failure. * 2. Path construction will be stopped at the inode number which corresponds * to the fd with which this ioctl is called. If constructed path does not * exist under fd's inode, -EACCES will be returned. * 3. The name of bottom subvolume is also searched and filled. */ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) { struct btrfs_ioctl_ino_lookup_user_args *args; struct inode *inode; int ret; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) return PTR_ERR(args); inode = file_inode(file); if (args->dirid == BTRFS_FIRST_FREE_OBJECTID && btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { /* * The subvolume does not exist under fd with which this is * called */ kfree(args); return -EACCES; } ret = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; kfree(args); return ret; } /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp) { struct btrfs_ioctl_get_subvol_info_args *subvol_info; struct btrfs_fs_info *fs_info; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_root_item *root_item; struct btrfs_root_ref *rref; struct extent_buffer *leaf; unsigned long item_off; unsigned long item_len; int slot; int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL); if (!subvol_info) { btrfs_free_path(path); return -ENOMEM; } fs_info = BTRFS_I(inode)->root->fs_info; /* Get root_item of inode's subvolume */ key.objectid = btrfs_root_id(BTRFS_I(inode)->root); root = btrfs_get_fs_root(fs_info, key.objectid, true); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; } root_item = &root->root_item; subvol_info->treeid = key.objectid; subvol_info->generation = btrfs_root_generation(root_item); subvol_info->flags = btrfs_root_flags(root_item); memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE); memcpy(subvol_info->parent_uuid, root_item->parent_uuid, BTRFS_UUID_SIZE); memcpy(subvol_info->received_uuid, root_item->received_uuid, BTRFS_UUID_SIZE); subvol_info->ctransid = btrfs_root_ctransid(root_item); subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime); subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime); subvol_info->otransid = btrfs_root_otransid(root_item); subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime); subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime); subvol_info->stransid = btrfs_root_stransid(root_item); subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime); subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime); subvol_info->rtransid = btrfs_root_rtransid(root_item); subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime); subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime); if (key.objectid != BTRFS_FS_TREE_OBJECTID) { /* Search root tree for ROOT_BACKREF of this subvolume */ key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(fs_info->tree_root, path); if (ret < 0) { goto out; } else if (ret > 0) { ret = -EUCLEAN; goto out; } } leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid == subvol_info->treeid && key.type == BTRFS_ROOT_BACKREF_KEY) { subvol_info->parent_id = key.offset; rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref); item_off = btrfs_item_ptr_offset(leaf, slot) + sizeof(struct btrfs_root_ref); item_len = btrfs_item_size(leaf, slot) - sizeof(struct btrfs_root_ref); read_extent_buffer(leaf, subvol_info->name, item_off, item_len); } else { ret = -ENOENT; goto out; } } btrfs_free_path(path); path = NULL; if (copy_to_user(argp, subvol_info, sizeof(*subvol_info))) ret = -EFAULT; out: btrfs_put_root(root); out_free: btrfs_free_path(path); kfree(subvol_info); return ret; } /* * Return ROOT_REF information of the subvolume containing this inode * except the subvolume name. */ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_get_subvol_rootref_args *rootrefs; struct btrfs_root_ref *rref; struct btrfs_path *path; struct btrfs_key key; struct extent_buffer *leaf; u64 objectid; int slot; int ret; u8 found; path = btrfs_alloc_path(); if (!path) return -ENOMEM; rootrefs = memdup_user(argp, sizeof(*rootrefs)); if (IS_ERR(rootrefs)) { btrfs_free_path(path); return PTR_ERR(rootrefs); } objectid = btrfs_root_id(root); key.objectid = objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = rootrefs->min_treeid; found = 0; root = root->fs_info->tree_root; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) { goto out; } else if (ret > 0) { ret = -EUCLEAN; goto out; } } while (1) { leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) { ret = 0; goto out; } if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) { ret = -EOVERFLOW; goto out; } rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); rootrefs->rootref[found].treeid = key.offset; rootrefs->rootref[found].dirid = btrfs_root_ref_dirid(leaf, rref); found++; ret = btrfs_next_item(root, path); if (ret < 0) { goto out; } else if (ret > 0) { ret = -EUCLEAN; goto out; } } out: btrfs_free_path(path); if (!ret || ret == -EOVERFLOW) { rootrefs->num_items = found; /* update min_treeid for next search */ if (found) rootrefs->min_treeid = rootrefs->rootref[found - 1].treeid + 1; if (copy_to_user(argp, rootrefs, sizeof(*rootrefs))) ret = -EFAULT; } kfree(rootrefs); return ret; } static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg, bool destroy_v2) { struct dentry *parent = file->f_path.dentry; struct dentry *dentry; struct inode *dir = d_inode(parent); struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args = NULL; struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; int ret = 0; bool destroy_parent = false; /* We don't support snapshots with extent tree v2 yet. */ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } if (destroy_v2) { vol_args2 = memdup_user(arg, sizeof(*vol_args2)); if (IS_ERR(vol_args2)) return PTR_ERR(vol_args2); if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } /* * If SPEC_BY_ID is not set, we are looking for the subvolume by * name, same as v1 currently does. */ if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) { ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2); if (ret < 0) goto out; subvol_name = vol_args2->name; ret = mnt_want_write_file(file); if (ret) goto out; } else { struct inode *old_dir; if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out; } ret = mnt_want_write_file(file); if (ret) goto out; dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, vol_args2->subvolid, 0); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_drop_write; } /* * Change the default parent since the subvolume being * deleted can be outside of the current mount point. */ parent = btrfs_get_parent(dentry); /* * At this point dentry->d_name can point to '/' if the * subvolume we want to destroy is outsite of the * current mount point, so we need to release the * current dentry and execute the lookup to return a new * one with ->d_name pointing to the * <mount point>/subvol_name. */ dput(dentry); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto out_drop_write; } old_dir = dir; dir = d_inode(parent); /* * If v2 was used with SPEC_BY_ID, a new parent was * allocated since the subvolume can be outside of the * current mount point. Later on we need to release this * new parent dentry. */ destroy_parent = true; /* * On idmapped mounts, deletion via subvolid is * restricted to subvolumes that are immediate * ancestors of the inode referenced by the file * descriptor in the ioctl. Otherwise the idmapping * could potentially be abused to delete subvolumes * anywhere in the filesystem the user wouldn't be able * to delete without an idmapped mount. */ if (old_dir != dir && idmap != &nop_mnt_idmap) { ret = -EOPNOTSUPP; goto free_parent; } subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { ret = PTR_ERR(subvol_name_ptr); goto free_parent; } /* subvol_name_ptr is already nul terminated */ subvol_name = (char *)kbasename(subvol_name_ptr); } } else { vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); ret = btrfs_check_ioctl_vol_args_path(vol_args); if (ret < 0) goto out; subvol_name = vol_args->name; ret = mnt_want_write_file(file); if (ret) goto out; } subvol_namelen = strlen(subvol_name); if (strchr(subvol_name, '/') || strncmp(subvol_name, "..", subvol_namelen) == 0) { ret = -EINVAL; goto free_subvol_name; } if (!S_ISDIR(dir->i_mode)) { ret = -ENOTDIR; goto free_subvol_name; } ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret == -EINTR) goto free_subvol_name; dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_unlock_dir; } if (d_really_is_negative(dentry)) { ret = -ENOENT; goto out_dput; } inode = d_inode(dentry); dest = BTRFS_I(inode)->root; if (!capable(CAP_SYS_ADMIN)) { /* * Regular user. Only allow this with a special mount * option, when the user has write+exec access to the * subvol root, and when rmdir(2) would have been * allowed. * * Note that this is _not_ check that the subvol is * empty or doesn't contain data that we wouldn't * otherwise be able to delete. * * Users who want to delete empty subvols should try * rmdir(2). */ ret = -EPERM; if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* * Do not allow deletion if the parent dir is the same * as the dir to be deleted. That means the ioctl * must be called on the dentry referencing the root * of the subvol, not a random directory contained * within it. */ ret = -EINVAL; if (root == dest) goto out_dput; ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC); if (ret) goto out_dput; } /* check if subvolume may be deleted by a user */ ret = btrfs_may_delete(idmap, dir, dentry, 1); if (ret) goto out_dput; if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out_dput; } btrfs_inode_lock(BTRFS_I(inode), 0); ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); if (!ret) d_delete_notify(dir, dentry); out_dput: dput(dentry); out_unlock_dir: btrfs_inode_unlock(BTRFS_I(dir), 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: if (destroy_parent) dput(parent); out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args2); kfree(vol_args); return ret; } static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_defrag_range_args range = {0}; int ret; ret = mnt_want_write_file(file); if (ret) return ret; if (btrfs_root_readonly(root)) { ret = -EROFS; goto out; } switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out; } ret = btrfs_defrag_root(root); break; case S_IFREG: /* * Note that this does not check the file descriptor for write * access. This prevents defragmenting executables that are * running and allows defrag on files open in read-only mode. */ if (!capable(CAP_SYS_ADMIN) && inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) { ret = -EPERM; goto out; } /* * Don't allow defrag on pre-content watched files, as it could * populate the page cache with 0's via readahead. */ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) { ret = -EINVAL; goto out; } if (argp) { if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; goto out; } if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) { ret = -EOPNOTSUPP; goto out; } /* compression requires us to start the IO */ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { range.flags |= BTRFS_DEFRAG_RANGE_START_IO; range.extent_thresh = (u32)-1; } } else { /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; break; default: ret = -EINVAL; } out: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; bool restore_op = false; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "device add not supported on extent tree v2 yet"); return -EINVAL; } if (fs_info->fs_devices->temp_fsid) { btrfs_err(fs_info, "device add not supported on cloned temp-fsid mount"); return -EINVAL; } if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; /* * We can do the device add because we have a paused balanced, * change the exclusive op type and remember we should bring * back the paused balance */ fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD; btrfs_exclop_start_unlock(fs_info); restore_op = true; } vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); goto out; } ret = btrfs_check_ioctl_vol_args_path(vol_args); if (ret < 0) goto out_free; ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) btrfs_info(fs_info, "disk added %s", vol_args->name); out_free: kfree(vol_args); out: if (restore_op) btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); else btrfs_exclop_finish(fs_info); return ret; } static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args_v2 *vol_args; struct file *bdev_file = NULL; int ret; bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args); if (ret < 0) goto out; if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { args.devid = vol_args->devid; } else if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); if (ret) goto out; } ret = mnt_want_write_file(file); if (ret) goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret) goto err_drop; /* Exclusive operation is now claimed */ ret = btrfs_rm_device(fs_info, &args, &bdev_file); btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) btrfs_info(fs_info, "device deleted: id %llu", vol_args->devid); else btrfs_info(fs_info, "device deleted: %s", vol_args->name); } err_drop: mnt_drop_write_file(file); if (bdev_file) fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); return ret; } static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_vol_args *vol_args; struct file *bdev_file = NULL; int ret; bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); ret = btrfs_check_ioctl_vol_args_path(vol_args); if (ret < 0) goto out_free; if (!strcmp("cancel", vol_args->name)) { cancel = true; } else { ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); if (ret) goto out; } ret = mnt_want_write_file(file); if (ret) goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { ret = btrfs_rm_device(fs_info, &args, &bdev_file); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); if (bdev_file) fput(bdev_file); out: btrfs_put_dev_args_from_path(&args); out_free: kfree(vol_args); return ret; } static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 flags_in; int ret = 0; fi_args = memdup_user(arg, sizeof(*fi_args)); if (IS_ERR(fi_args)) return PTR_ERR(fi_args); flags_in = fi_args->flags; memset(fi_args, 0, sizeof(*fi_args)); rcu_read_lock(); fi_args->num_devices = fs_devices->num_devices; list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } rcu_read_unlock(); memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid)); fi_args->nodesize = fs_info->nodesize; fi_args->sectorsize = fs_info->sectorsize; fi_args->clone_alignment = fs_info->sectorsize; if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) { fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy); fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy); fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO; } if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { fi_args->generation = btrfs_get_fs_generation(fs_info); fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) { memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid, sizeof(fi_args->metadata_uuid)); fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID; } if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; kfree(fi_args); return ret; } static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; int ret = 0; di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); args.devid = di_args->devid; if (!btrfs_is_empty_uuid(di_args->uuid)) args.uuid = di_args->uuid; rcu_read_lock(); dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { ret = -ENODEV; goto out; } di_args->devid = dev->devid; di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); if (dev->name) strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); else di_args->path[0] = '\0'; out: rcu_read_unlock(); if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) ret = -EFAULT; kfree(di_args); return ret; } static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; struct fscrypt_str name = FSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; if (copy_from_user(&objectid, argp, sizeof(objectid))) { ret = -EFAULT; goto out; } if (!objectid) objectid = BTRFS_FS_TREE_OBJECTID; new_root = btrfs_get_fs_root(fs_info, objectid, true); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); goto out; } if (!is_fstree(btrfs_root_id(new_root))) { ret = -ENOENT; goto out_free; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out_free; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_free; } dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, dir_id, &name, 1); if (IS_ERR_OR_NULL(di)) { btrfs_release_path(path); btrfs_end_transaction(trans); btrfs_err(fs_info, "Umm, you don't have the default diritem, this isn't going to work"); ret = -ENOENT; goto out_free; } btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); btrfs_end_transaction(trans); out_free: btrfs_put_root(new_root); btrfs_free_path(path); out: mnt_drop_write_file(file); return ret; } static void get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space) { struct btrfs_block_group *block_group; space->total_bytes = 0; space->used_bytes = 0; space->flags = 0; list_for_each_entry(block_group, groups_list, list) { space->flags = block_group->flags; space->total_bytes += block_group->length; space->used_bytes += block_group->used; } } static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; static const u64 types[] = { BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA, BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA }; int num_types = 4; int alloc_size; int ret = 0; u64 slot_count = 0; int i, c; if (copy_from_user(&space_args, (struct btrfs_ioctl_space_args __user *)arg, sizeof(space_args))) return -EFAULT; for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; info = NULL; list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } if (!info) continue; down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) slot_count++; } up_read(&info->groups_sem); } /* * Global block reserve, exported as a space_info */ slot_count++; /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; goto out; } slot_count = min_t(u64, space_args.space_slots, slot_count); alloc_size = sizeof(*dest) * slot_count; /* we generally have at most 6 or so space infos, one for each raid * level. So, a whole page should be more than enough for everyone */ if (alloc_size > PAGE_SIZE) return -ENOMEM; space_args.total_spaces = 0; dest = kmalloc(alloc_size, GFP_KERNEL); if (!dest) return -ENOMEM; dest_orig = dest; /* now we have a buffer to copy into */ for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; if (!slot_count) break; info = NULL; list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } if (!info) continue; down_read(&info->groups_sem); for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { if (!list_empty(&info->block_groups[c])) { get_block_group_info(&info->block_groups[c], &space); memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; slot_count--; } if (!slot_count) break; } up_read(&info->groups_sem); } /* * Add global block reserve */ if (slot_count) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; spin_lock(&block_rsv->lock); space.total_bytes = block_rsv->size; space.used_bytes = block_rsv->size - block_rsv->reserved; spin_unlock(&block_rsv->lock); space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV; memcpy(dest, &space, sizeof(space)); space_args.total_spaces++; } user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); if (copy_to_user(user_dest, dest_orig, alloc_size)) ret = -EFAULT; kfree(dest_orig); out: if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) ret = -EFAULT; return ret; } static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, void __user *argp) { struct btrfs_trans_handle *trans; u64 transid; /* * Start orphan cleanup here for the given root in case it hasn't been * started already by other means. Errors are handled in the other * functions during transaction commit. */ btrfs_orphan_cleanup(root); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) return PTR_ERR(trans); /* No running transaction, don't bother */ transid = btrfs_get_last_trans_committed(root->fs_info); goto out; } transid = trans->transid; btrfs_commit_transaction_async(trans); out: if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) return -EFAULT; return 0; } static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, void __user *argp) { /* By default wait for the current transaction. */ u64 transid = 0; if (argp) if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; return btrfs_wait_for_commit(fs_info, transid); } static long btrfs_ioctl_scrub(struct file *file, void __user *arg) { struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file)); struct btrfs_ioctl_scrub_args *sa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); return -EINVAL; } sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { ret = -EOPNOTSUPP; goto out; } if (!(sa->flags & BTRFS_SCRUB_READONLY)) { ret = mnt_want_write_file(file); if (ret) goto out; } ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); /* * Copy scrub args to user space even if btrfs_scrub_dev() returned an * error. This is important as it allows user space to know how much * progress scrub has done. For example, if scrub is canceled we get * -ECANCELED from btrfs_scrub_dev() and return that error back to user * space. Later user space can inspect the progress from the structure * btrfs_ioctl_scrub_args and resume scrub from where it left off * previously (btrfs-progs does this). * If we fail to copy the btrfs_ioctl_scrub_args structure to user space * then return -EFAULT to signal the structure was not copied or it may * be corrupt and unreliable due to a partial copy. */ if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; if (!(sa->flags & BTRFS_SCRUB_READONLY)) mnt_drop_write_file(file); out: kfree(sa); return ret; } static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return btrfs_scrub_cancel(fs_info); } static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_scrub_args *sa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); return ret; } static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_get_dev_stats *sa; int ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) { kfree(sa); return -EPERM; } ret = btrfs_get_dev_stats(fs_info, sa); if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; kfree(sa); return ret; } static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_dev_replace_args *p; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_err(fs_info, "device replace not supported on extent tree v2 yet"); return -EINVAL; } p = memdup_user(arg, sizeof(*p)); if (IS_ERR(p)) return PTR_ERR(p); switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: if (sb_rdonly(fs_info->sb)) { ret = -EROFS; goto out; } if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: btrfs_dev_replace_status(fs_info, p); ret = 0; break; case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: p->result = btrfs_dev_replace_cancel(fs_info); ret = 0; break; default: ret = -EINVAL; break; } if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p))) ret = -EFAULT; out: kfree(p); return ret; } static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) { int ret = 0; int i; u64 rel_ptr; int size; struct btrfs_ioctl_ino_path_args *ipa = NULL; struct inode_fs_paths *ipath = NULL; struct btrfs_path *path; if (!capable(CAP_DAC_READ_SEARCH)) return -EPERM; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ipa = memdup_user(arg, sizeof(*ipa)); if (IS_ERR(ipa)) { ret = PTR_ERR(ipa); ipa = NULL; goto out; } size = min_t(u32, ipa->size, 4096); ipath = init_ipath(size, root, path); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; goto out; } ret = paths_from_inode(ipa->inum, ipath); if (ret < 0) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { rel_ptr = ipath->fspath->val[i] - (u64)(unsigned long)ipath->fspath->val; ipath->fspath->val[i] = rel_ptr; } btrfs_free_path(path); path = NULL; ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; } out: btrfs_free_path(path); free_ipath(ipath); kfree(ipa); return ret; } static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { int ret = 0; int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) return -EPERM; loi = memdup_user(arg, sizeof(*loi)); if (IS_ERR(loi)) return PTR_ERR(loi); if (version == 1) { ignore_offset = false; size = min_t(u32, loi->size, SZ_64K); } else { /* All reserved bits must be 0 for now */ if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { ret = -EINVAL; goto out_loi; } /* Only accept flags we have defined so far */ if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { ret = -EINVAL; goto out_loi; } ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; size = min_t(u32, loi->size, SZ_16M); } inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); goto out_loi; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, inodes, ignore_offset); btrfs_free_path(path); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) goto out; ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, size); if (ret) ret = -EFAULT; out: kvfree(inodes); out_loi: kfree(loi); return ret; } void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; bargs->flags = bctl->flags; if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) bargs->state |= BTRFS_BALANCE_STATE_RUNNING; if (atomic_read(&fs_info->balance_pause_req)) bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ; if (atomic_read(&fs_info->balance_cancel_req)) bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ; memcpy(&bargs->data, &bctl->data, sizeof(bargs->data)); memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta)); memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys)); spin_lock(&fs_info->balance_lock); memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat)); spin_unlock(&fs_info->balance_lock); } /* * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as * required. * * @fs_info: the filesystem * @excl_acquired: ptr to boolean value which is set to false in case balance * is being resumed * * Return 0 on success in which case both fs_info::balance is acquired as well * as exclusive ops are blocked. In case of failure return an error code. */ static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) { int ret; /* * Exclusive operation is locked. Three possibilities: * (1) some other op is running * (2) balance is running * (3) balance is paused -- special case (think resume) */ while (1) { if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { *excl_acquired = true; mutex_lock(&fs_info->balance_mutex); return 0; } mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl) { /* This is either (2) or (3) */ if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { /* This is (2) */ ret = -EINPROGRESS; goto out_failure; } else { mutex_unlock(&fs_info->balance_mutex); /* * Lock released to allow other waiters to * continue, we'll reexamine the status again. */ mutex_lock(&fs_info->balance_mutex); if (fs_info->balance_ctl && !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { /* This is (3) */ *excl_acquired = false; return 0; } } } else { /* This is (1) */ ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_failure; } mutex_unlock(&fs_info->balance_mutex); } out_failure: mutex_unlock(&fs_info->balance_mutex); *excl_acquired = false; return ret; } static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; bool need_unlock = true; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); bargs = NULL; goto out; } ret = btrfs_try_lock_balance(fs_info, &need_unlock); if (ret) goto out; lockdep_assert_held(&fs_info->balance_mutex); if (bargs->flags & BTRFS_BALANCE_RESUME) { if (!fs_info->balance_ctl) { ret = -ENOTCONN; goto out_unlock; } bctl = fs_info->balance_ctl; spin_lock(&fs_info->balance_lock); bctl->flags |= BTRFS_BALANCE_RESUME; spin_unlock(&fs_info->balance_lock); btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); goto do_balance; } if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { ret = -EINVAL; goto out_unlock; } if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_unlock; } bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; goto out_unlock; } memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); bctl->flags = bargs->flags; do_balance: /* * Ownership of bctl and exclusive operation goes to btrfs_balance. * bctl is freed in reset_balance_state, or, if restriper was paused * all the way until unmount, in free_fs_info. The flag should be * cleared after reset_balance_state. */ need_unlock = false; ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; if (ret == 0 || ret == -ECANCELED) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } kfree(bctl); out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); kfree(bargs); return ret; } static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case BTRFS_BALANCE_CTL_PAUSE: return btrfs_pause_balance(fs_info); case BTRFS_BALANCE_CTL_CANCEL: return btrfs_cancel_balance(fs_info); } return -EINVAL; } static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_balance_args *bargs; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&fs_info->balance_mutex); if (!fs_info->balance_ctl) { ret = -ENOTCONN; goto out; } bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); if (!bargs) { ret = -ENOMEM; goto out; } btrfs_update_ioctl_balance_args(fs_info, bargs); if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; kfree(bargs); out: mutex_unlock(&fs_info->balance_mutex); return ret; } static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_ctl_args *sa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: down_write(&fs_info->subvol_sem); ret = btrfs_quota_enable(fs_info, sa); up_write(&fs_info->subvol_sem); break; case BTRFS_QUOTA_CTL_DISABLE: /* * Lock the cleaner mutex to prevent races with concurrent * relocation, because relocation may be building backrefs for * blocks of the quota root while we are deleting the root. This * is like dropping fs roots of deleted snapshots/subvolumes, we * need the same protection. * * This also prevents races between concurrent tasks trying to * disable quotas, because we will unlock and relock * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. * * We take this here because we have the dependency of * * inode_lock -> subvol_sem * * because of rename. With relocation we can prealloc extents, * so that makes the dependency chain * * cleaner_mutex -> inode_lock -> subvol_sem * * so we must take the cleaner_mutex here before we take the * subvol_sem. The deadlock can't actually happen, but this * quiets lockdep. */ mutex_lock(&fs_info->cleaner_mutex); down_write(&fs_info->subvol_sem); ret = btrfs_quota_disable(fs_info); up_write(&fs_info->subvol_sem); mutex_unlock(&fs_info->cleaner_mutex); break; default: ret = -EINVAL; break; } kfree(sa); drop_write: mnt_drop_write_file(file); return ret; } /* * Quick check for ioctl handlers if quotas are enabled. Proper locking must be * done before any operations. */ static bool qgroup_enabled(struct btrfs_fs_info *fs_info) { bool ret = true; mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) ret = false; mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_qgroup_list *prealloc = NULL; struct btrfs_trans_handle *trans; int ret; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } if (sa->assign) { prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { ret = -ENOMEM; goto drop_write; } } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } /* * Prealloc ownership is moved to the relation handler, there it's used * or freed on error. */ if (sa->assign) { ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc); prealloc = NULL; } else { ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst); } /* update qgroup status and info */ mutex_lock(&fs_info->qgroup_ioctl_lock); err = btrfs_run_qgroups(trans); mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) btrfs_warn(fs_info, "qgroup status update failed after %s relation, marked as inconsistent", sa->assign ? "adding" : "deleting"); err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: kfree(prealloc); kfree(sa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; int ret; int err; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } if (!sa->qgroupid) { ret = -EINVAL; goto out; } if (sa->create && is_fstree(sa->qgroupid)) { ret = -EINVAL; goto out; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } if (sa->create) { ret = btrfs_create_qgroup(trans, sa->qgroupid); } else { ret = btrfs_remove_qgroup(trans, sa->qgroupid); } err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: kfree(sa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; int ret; int err; u64 qgroupid; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!qgroup_enabled(root->fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); if (ret) return ret; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) { ret = PTR_ERR(sa); goto drop_write; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } qgroupid = sa->qgroupid; if (!qgroupid) { /* take the current subvol as qgroup */ qgroupid = btrfs_root_id(root); } ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim); err = btrfs_end_transaction(trans); if (err && !ret) ret = err; out: kfree(sa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!qgroup_enabled(fs_info)) return -ENOTCONN; ret = mnt_want_write_file(file); if (ret) return ret; qsa = memdup_user(arg, sizeof(*qsa)); if (IS_ERR(qsa)) { ret = PTR_ERR(qsa); goto drop_write; } if (qsa->flags) { ret = -EINVAL; goto out; } ret = btrfs_qgroup_rescan(fs_info); out: kfree(qsa); drop_write: mnt_drop_write_file(file); return ret; } static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_quota_rescan_args qsa = {0}; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { qsa.flags = 1; qsa.progress = fs_info->qgroup_rescan_progress.objectid; } if (copy_to_user(arg, &qsa, sizeof(qsa))) return -EFAULT; return 0; } static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return btrfs_qgroup_wait_for_completion(fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, struct mnt_idmap *idmap, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; struct timespec64 ct = current_time(inode); int ret = 0; int received_uuid_changed; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; ret = mnt_want_write_file(file); if (ret < 0) return ret; down_write(&fs_info->subvol_sem); if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; goto out; } if (btrfs_root_readonly(root)) { ret = -EROFS; goto out; } /* * 1 - root item * 2 - uuid items (received uuid + subvol uuid) */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } sa->rtransid = trans->transid; sa->rtime.sec = ct.tv_sec; sa->rtime.nsec = ct.tv_nsec; received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); if (received_uuid_changed && !btrfs_is_empty_uuid(root_item->received_uuid)) { ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } } memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); btrfs_set_root_stransid(root_item, sa->stransid); btrfs_set_root_rtransid(root_item, sa->rtransid); btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec); btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec); btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { btrfs_end_transaction(trans); goto out; } if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); out: up_write(&fs_info->subvol_sem); mnt_drop_write_file(file); return ret; } #ifdef CONFIG_64BIT static long btrfs_ioctl_set_received_subvol_32(struct file *file, void __user *arg) { struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; struct btrfs_ioctl_received_subvol_args *args64 = NULL; int ret = 0; args32 = memdup_user(arg, sizeof(*args32)); if (IS_ERR(args32)) return PTR_ERR(args32); args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { ret = -ENOMEM; goto out; } memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); args64->stransid = args32->stransid; args64->rtransid = args32->rtransid; args64->stime.sec = args32->stime.sec; args64->stime.nsec = args32->stime.nsec; args64->rtime.sec = args32->rtime.sec; args64->rtime.nsec = args32->rtime.nsec; args64->flags = args32->flags; ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), args64); if (ret) goto out; memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); args32->stransid = args64->stransid; args32->rtransid = args64->rtransid; args32->stime.sec = args64->stime.sec; args32->stime.nsec = args64->stime.nsec; args32->rtime.sec = args64->rtime.sec; args32->rtime.nsec = args64->rtime.nsec; args32->flags = args64->flags; ret = copy_to_user(arg, args32, sizeof(*args32)); if (ret) ret = -EFAULT; out: kfree(args32); kfree(args64); return ret; } #endif static long btrfs_ioctl_set_received_subvol(struct file *file, void __user *arg) { struct btrfs_ioctl_received_subvol_args *sa = NULL; int ret = 0; sa = memdup_user(arg, sizeof(*sa)); if (IS_ERR(sa)) return PTR_ERR(sa); ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa); if (ret) goto out; ret = copy_to_user(arg, sa, sizeof(*sa)); if (ret) ret = -EFAULT; out: kfree(sa); return ret; } static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info, void __user *arg) { size_t len; int ret; char label[BTRFS_LABEL_SIZE]; spin_lock(&fs_info->super_lock); memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE); spin_unlock(&fs_info->super_lock); len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { btrfs_warn(fs_info, "label is too long, return the first %zu bytes", --len); } ret = copy_to_user(arg, label, len); return ret ? -EFAULT : 0; } static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; char label[BTRFS_LABEL_SIZE]; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(label, arg, sizeof(label))) return -EFAULT; if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { btrfs_err(fs_info, "unable to set label with more than %d bytes", BTRFS_LABEL_SIZE - 1); return -EINVAL; } ret = mnt_want_write_file(file); if (ret) return ret; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_unlock; } spin_lock(&fs_info->super_lock); strcpy(super_block->label, label); spin_unlock(&fs_info->super_lock); ret = btrfs_commit_transaction(trans); out_unlock: mnt_drop_write_file(file); return ret; } #define INIT_FEATURE_FLAGS(suffix) \ { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \ .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \ .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix } int btrfs_ioctl_get_supported_features(void __user *arg) { static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), INIT_FEATURE_FLAGS(SAFE_SET), INIT_FEATURE_FLAGS(SAFE_CLEAR) }; if (copy_to_user(arg, &features, sizeof(features))) return -EFAULT; return 0; } static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags features; features.compat_flags = btrfs_super_compat_flags(super_block); features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block); features.incompat_flags = btrfs_super_incompat_flags(super_block); if (copy_to_user(arg, &features, sizeof(features))) return -EFAULT; return 0; } static int check_feature_bits(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) { const char *type = btrfs_feature_set_name(set); char *names; u64 disallowed, unsupported; u64 set_mask = flags & change_mask; u64 clear_mask = ~flags & change_mask; unsupported = set_mask & ~supported_flags; if (unsupported) { names = btrfs_printable_features(set, unsupported); if (names) { btrfs_warn(fs_info, "this kernel does not support the %s feature bit%s", names, strchr(names, ',') ? "s" : ""); kfree(names); } else btrfs_warn(fs_info, "this kernel does not support %s bits 0x%llx", type, unsupported); return -EOPNOTSUPP; } disallowed = set_mask & ~safe_set; if (disallowed) { names = btrfs_printable_features(set, disallowed); if (names) { btrfs_warn(fs_info, "can't set the %s feature bit%s while mounted", names, strchr(names, ',') ? "s" : ""); kfree(names); } else btrfs_warn(fs_info, "can't set %s bits 0x%llx while mounted", type, disallowed); return -EPERM; } disallowed = clear_mask & ~safe_clear; if (disallowed) { names = btrfs_printable_features(set, disallowed); if (names) { btrfs_warn(fs_info, "can't clear the %s feature bit%s while mounted", names, strchr(names, ',') ? "s" : ""); kfree(names); } else btrfs_warn(fs_info, "can't clear %s bits 0x%llx while mounted", type, disallowed); return -EPERM; } return 0; } #define check_feature(fs_info, change_mask, flags, mask_base) \ check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ BTRFS_FEATURE_ ## mask_base ## _SUPP, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; struct btrfs_trans_handle *trans; u64 newflags; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (copy_from_user(flags, arg, sizeof(flags))) return -EFAULT; /* Nothing to do */ if (!flags[0].compat_flags && !flags[0].compat_ro_flags && !flags[0].incompat_flags) return 0; ret = check_feature(fs_info, flags[0].compat_flags, flags[1].compat_flags, COMPAT); if (ret) return ret; ret = check_feature(fs_info, flags[0].compat_ro_flags, flags[1].compat_ro_flags, COMPAT_RO); if (ret) return ret; ret = check_feature(fs_info, flags[0].incompat_flags, flags[1].incompat_flags, INCOMPAT); if (ret) return ret; ret = mnt_want_write_file(file); if (ret) return ret; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_drop_write; } spin_lock(&fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); newflags |= flags[0].compat_flags & flags[1].compat_flags; newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); btrfs_set_super_compat_flags(super_block, newflags); newflags = btrfs_super_compat_ro_flags(super_block); newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags; newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags); btrfs_set_super_compat_ro_flags(super_block, newflags); newflags = btrfs_super_incompat_flags(super_block); newflags |= flags[0].incompat_flags & flags[1].incompat_flags; newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags); btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&fs_info->super_lock); ret = btrfs_commit_transaction(trans); out_drop_write: mnt_drop_write_file(file); return ret; } static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) return -EFAULT; arg = kzalloc(sizeof(*arg), GFP_KERNEL); if (!arg) return -ENOMEM; arg->send_fd = args32.send_fd; arg->clone_sources_count = args32.clone_sources_count; arg->clone_sources = compat_ptr(args32.clone_sources); arg->parent_root = args32.parent_root; arg->flags = args32.flags; arg->version = args32.version; memcpy(arg->reserved, args32.reserved, sizeof(args32.reserved)); #else return -ENOTTY; #endif } else { arg = memdup_user(argp, sizeof(*arg)); if (IS_ERR(arg)) return PTR_ERR(arg); } ret = btrfs_ioctl_send(inode, arg); kfree(arg); return ret; } static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, bool compat) { struct btrfs_ioctl_encoded_io_args args = { 0 }; size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; u64 disk_bytenr, disk_io_size; struct extent_state *cached_state = NULL; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out_acct; } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); if (copy_from_user(&args32, argp, copy_end)) { ret = -EFAULT; goto out_acct; } args.iov = compat_ptr(args32.iov); args.iovcnt = args32.iovcnt; args.offset = args32.offset; args.flags = args32.flags; #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; if (copy_from_user(&args, argp, copy_end)) { ret = -EFAULT; goto out_acct; } } if (args.flags != 0) { ret = -EINVAL; goto out_acct; } ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; if (iov_iter_count(&iter) == 0) { ret = 0; goto out_iov; } pos = args.offset; ret = rw_verify_area(READ, file, &pos, args.len); if (ret < 0) goto out_iov; init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state, &disk_bytenr, &disk_io_size); if (ret == -EIOCBQUEUED) { bool unlocked = false; u64 start, lockend, count; start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; if (args.compression) count = disk_io_size; else count = args.len; ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend, &cached_state, disk_bytenr, disk_io_size, count, args.compression, &unlocked); if (!unlocked) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } } if (ret >= 0) { fsnotify_access(file); if (copy_to_user(argp + copy_end, (char *)&args + copy_end_kernel, sizeof(args) - copy_end_kernel)) ret = -EFAULT; } out_iov: kfree(iov); out_acct: if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; loff_t pos; struct kiocb kiocb; ssize_t ret; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out_acct; } if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; if (copy_from_user(&args32, argp, sizeof(args32))) { ret = -EFAULT; goto out_acct; } args.iov = compat_ptr(args32.iov); args.iovcnt = args32.iovcnt; args.offset = args32.offset; args.flags = args32.flags; args.len = args32.len; args.unencoded_len = args32.unencoded_len; args.unencoded_offset = args32.unencoded_offset; args.compression = args32.compression; args.encryption = args32.encryption; memcpy(args.reserved, args32.reserved, sizeof(args.reserved)); #else return -ENOTTY; #endif } else { if (copy_from_user(&args, argp, sizeof(args))) { ret = -EFAULT; goto out_acct; } } ret = -EINVAL; if (args.flags != 0) goto out_acct; if (memchr_inv(args.reserved, 0, sizeof(args.reserved))) goto out_acct; if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) goto out_acct; if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) goto out_acct; if (args.unencoded_offset > args.unencoded_len) goto out_acct; if (args.len > args.unencoded_len - args.unencoded_offset) goto out_acct; ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out_acct; if (iov_iter_count(&iter) == 0) { ret = 0; goto out_iov; } pos = args.offset; ret = rw_verify_area(WRITE, file, &pos, args.len); if (ret < 0) goto out_iov; init_sync_kiocb(&kiocb, file); ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; file_start_write(file); ret = btrfs_do_write_iter(&kiocb, &iter, &args); if (ret > 0) fsnotify_modify(file); file_end_write(file); out_iov: kfree(iov); out_acct: if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } /* * Context that's attached to an encoded read io_uring command, in cmd->pdu. It * contains the fields in btrfs_uring_read_extent that are necessary to finish * off and cleanup the I/O in btrfs_uring_read_finished. */ struct btrfs_uring_priv { struct io_uring_cmd *cmd; struct page **pages; unsigned long nr_pages; struct kiocb iocb; struct iovec *iov; struct iov_iter iter; struct extent_state *cached_state; u64 count; u64 start; u64 lockend; int err; bool compressed; }; struct io_btrfs_cmd { struct btrfs_uring_priv *priv; }; static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); struct btrfs_uring_priv *priv = bc->priv; struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; unsigned long index; u64 cur; size_t page_offset; ssize_t ret; /* The inode lock has already been acquired in btrfs_uring_read_extent. */ btrfs_lockdep_inode_acquire(inode, i_rwsem); if (priv->err) { ret = priv->err; goto out; } if (priv->compressed) { index = 0; page_offset = 0; } else { index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT; page_offset = offset_in_page(priv->iocb.ki_pos - priv->start); } cur = 0; while (cur < priv->count) { size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset); if (copy_page_to_iter(priv->pages[index], page_offset, bytes, &priv->iter) != bytes) { ret = -EFAULT; goto out; } index++; cur += bytes; page_offset = 0; } ret = priv->count; out: unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); io_uring_cmd_done(cmd, ret, 0, issue_flags); add_rchar(current, ret); for (index = 0; index < priv->nr_pages; index++) __free_page(priv->pages[index]); kfree(priv->pages); kfree(priv->iov); kfree(priv); } void btrfs_uring_read_extent_endio(void *ctx, int err) { struct btrfs_uring_priv *priv = ctx; struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd); priv->err = err; bc->priv = priv; io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished); } static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, u64 start, u64 lockend, struct extent_state *cached_state, u64 disk_bytenr, u64 disk_io_size, size_t count, bool compressed, struct iovec *iov, struct io_uring_cmd *cmd) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; struct page **pages; struct btrfs_uring_priv *priv = NULL; unsigned long nr_pages; int ret; nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; ret = btrfs_alloc_page_array(nr_pages, pages, 0); if (ret) { ret = -ENOMEM; goto out_fail; } priv = kmalloc(sizeof(*priv), GFP_NOFS); if (!priv) { ret = -ENOMEM; goto out_fail; } priv->iocb = *iocb; priv->iov = iov; priv->iter = *iter; priv->count = count; priv->cmd = cmd; priv->cached_state = cached_state; priv->compressed = compressed; priv->nr_pages = nr_pages; priv->pages = pages; priv->start = start; priv->lockend = lockend; priv->err = 0; ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_io_size, pages, priv); if (ret && ret != -EIOCBQUEUED) goto out_fail; /* * If we return -EIOCBQUEUED, we're deferring the cleanup to * btrfs_uring_read_finished(), which will handle unlocking the extent * and inode and freeing the allocations. */ /* * We're returning to userspace with the inode lock held, and that's * okay - it'll get unlocked in a worker thread. Call * btrfs_lockdep_inode_release() to avoid confusing lockdep. */ btrfs_lockdep_inode_release(inode, i_rwsem); return -EIOCBQUEUED; out_fail: unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); kfree(priv); return ret; } struct btrfs_uring_encoded_data { struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov; struct iov_iter iter; }; static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags) { size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; int ret; u64 disk_bytenr, disk_io_size; struct file *file; struct btrfs_inode *inode; struct btrfs_fs_info *fs_info; struct extent_io_tree *io_tree; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; u64 start, lockend; void __user *sqe_addr; struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out_acct; } file = cmd->file; inode = BTRFS_I(file->f_inode); fs_info = inode->root->fs_info; io_tree = &inode->io_tree; sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags); #else return -ENOTTY; #endif } else { copy_end = copy_end_kernel; } if (!data) { data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { ret = -ENOMEM; goto out_acct; } io_uring_cmd_get_async_data(cmd)->op_data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; if (copy_from_user(&args32, sqe_addr, copy_end)) { ret = -EFAULT; goto out_acct; } data->args.iov = compat_ptr(args32.iov); data->args.iovcnt = args32.iovcnt; data->args.offset = args32.offset; data->args.flags = args32.flags; #endif } else { if (copy_from_user(&data->args, sqe_addr, copy_end)) { ret = -EFAULT; goto out_acct; } } if (data->args.flags != 0) { ret = -EINVAL; goto out_acct; } data->iov = data->iovstack; ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt, ARRAY_SIZE(data->iovstack), &data->iov, &data->iter); if (ret < 0) goto out_acct; if (iov_iter_count(&data->iter) == 0) { ret = 0; goto out_free; } } pos = data->args.offset; ret = rw_verify_area(READ, file, &pos, data->args.len); if (ret < 0) goto out_free; init_sync_kiocb(&kiocb, file); kiocb.ki_pos = pos; if (issue_flags & IO_URING_F_NONBLOCK) kiocb.ki_flags |= IOCB_NOWAIT; start = ALIGN_DOWN(pos, fs_info->sectorsize); lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; file_accessed(file); if (copy_to_user(sqe_addr + copy_end, (const char *)&data->args + copy_end_kernel, sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } ret = -EFAULT; goto out_free; } if (ret == -EIOCBQUEUED) { u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size); /* Match ioctl by not returning past EOF if uncompressed. */ if (!data->args.compression) count = min_t(u64, count, data->args.len); ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend, cached_state, disk_bytenr, disk_io_size, count, data->args.compression, data->iov, cmd); goto out_acct; } out_free: kfree(data->iov); out_acct: if (ret > 0) add_rchar(current, ret); inc_syscr(current); return ret; } static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { loff_t pos; struct kiocb kiocb; struct file *file; ssize_t ret; void __user *sqe_addr; struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data; if (!capable(CAP_SYS_ADMIN)) { ret = -EPERM; goto out_acct; } file = cmd->file; sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; } if (!data) { data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { ret = -ENOMEM; goto out_acct; } io_uring_cmd_get_async_data(cmd)->op_data = data; if (issue_flags & IO_URING_F_COMPAT) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; if (copy_from_user(&args32, sqe_addr, sizeof(args32))) { ret = -EFAULT; goto out_acct; } data->args.iov = compat_ptr(args32.iov); data->args.iovcnt = args32.iovcnt; data->args.offset = args32.offset; data->args.flags = args32.flags; data->args.len = args32.len; data->args.unencoded_len = args32.unencoded_len; data->args.unencoded_offset = args32.unencoded_offset; data->args.compression = args32.compression; data->args.encryption = args32.encryption; memcpy(data->args.reserved, args32.reserved, sizeof(data->args.reserved)); #else ret = -ENOTTY; goto out_acct; #endif } else { if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) { ret = -EFAULT; goto out_acct; } } ret = -EINVAL; if (data->args.flags != 0) goto out_acct; if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved))) goto out_acct; if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE && data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE) goto out_acct; if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES || data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES) goto out_acct; if (data->args.unencoded_offset > data->args.unencoded_len) goto out_acct; if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset) goto out_acct; data->iov = data->iovstack; ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt, ARRAY_SIZE(data->iovstack), &data->iov, &data->iter); if (ret < 0) goto out_acct; if (iov_iter_count(&data->iter) == 0) { ret = 0; goto out_iov; } } if (issue_flags & IO_URING_F_NONBLOCK) { ret = -EAGAIN; goto out_acct; } pos = data->args.offset; ret = rw_verify_area(WRITE, file, &pos, data->args.len); if (ret < 0) goto out_iov; init_sync_kiocb(&kiocb, file); ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; file_start_write(file); ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args); if (ret > 0) fsnotify_modify(file); file_end_write(file); out_iov: kfree(data->iov); out_acct: if (ret > 0) add_wchar(current, ret); inc_syscw(current); return ret; } int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { switch (cmd->cmd_op) { case BTRFS_IOC_ENCODED_READ: #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_ENCODED_READ_32: #endif return btrfs_uring_encoded_read(cmd, issue_flags); case BTRFS_IOC_ENCODED_WRITE: #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_ENCODED_WRITE_32: #endif return btrfs_uring_encoded_write(cmd, issue_flags); } return -EINVAL; } static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp) { struct btrfs_root *root; struct btrfs_ioctl_subvol_wait args = { 0 }; signed long sched_ret; int refs; u64 root_flags; bool wait_for_deletion = false; bool found = false; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; switch (args.mode) { case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED: /* * Wait for the first one deleted that waits until all previous * are cleaned. */ spin_lock(&fs_info->trans_lock); if (!list_empty(&fs_info->dead_roots)) { root = list_last_entry(&fs_info->dead_roots, struct btrfs_root, root_list); args.subvolid = btrfs_root_id(root); found = true; } spin_unlock(&fs_info->trans_lock); if (!found) return -ENOENT; fallthrough; case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE: if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) || BTRFS_LAST_FREE_OBJECTID < args.subvolid) return -EINVAL; break; case BTRFS_SUBVOL_SYNC_COUNT: spin_lock(&fs_info->trans_lock); args.count = list_count_nodes(&fs_info->dead_roots); spin_unlock(&fs_info->trans_lock); if (copy_to_user(argp, &args, sizeof(args))) return -EFAULT; return 0; case BTRFS_SUBVOL_SYNC_PEEK_FIRST: spin_lock(&fs_info->trans_lock); /* Last in the list was deleted first. */ if (!list_empty(&fs_info->dead_roots)) { root = list_last_entry(&fs_info->dead_roots, struct btrfs_root, root_list); args.subvolid = btrfs_root_id(root); } else { args.subvolid = 0; } spin_unlock(&fs_info->trans_lock); if (copy_to_user(argp, &args, sizeof(args))) return -EFAULT; return 0; case BTRFS_SUBVOL_SYNC_PEEK_LAST: spin_lock(&fs_info->trans_lock); /* First in the list was deleted last. */ if (!list_empty(&fs_info->dead_roots)) { root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); args.subvolid = btrfs_root_id(root); } else { args.subvolid = 0; } spin_unlock(&fs_info->trans_lock); if (copy_to_user(argp, &args, sizeof(args))) return -EFAULT; return 0; default: return -EINVAL; } /* 32bit limitation: fs_roots_radix key is not wide enough. */ if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX) return -EOVERFLOW; while (1) { /* Wait for the specific one. */ if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR) return -EINTR; refs = -1; spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)args.subvolid); if (root) { spin_lock(&root->root_item_lock); refs = btrfs_root_refs(&root->root_item); root_flags = btrfs_root_flags(&root->root_item); spin_unlock(&root->root_item_lock); } spin_unlock(&fs_info->fs_roots_radix_lock); up_read(&fs_info->subvol_sem); /* Subvolume does not exist. */ if (!root) return -ENOENT; /* Subvolume not deleted at all. */ if (refs > 0) return -EEXIST; /* We've waited and now the subvolume is gone. */ if (wait_for_deletion && refs == -1) { /* Return the one we waited for as the last one. */ if (copy_to_user(argp, &args, sizeof(args))) return -EFAULT; return 0; } /* Subvolume not found on the first try (deleted or never existed). */ if (refs == -1) return -ENOENT; wait_for_deletion = true; ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD); sched_ret = schedule_timeout_interruptible(HZ); /* Early wake up or error. */ if (sched_ret != 0) return -EINTR; } return 0; } long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; switch (cmd) { case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(inode, argp); case FS_IOC_GETFSLABEL: return btrfs_ioctl_get_fslabel(fs_info, argp); case FS_IOC_SETFSLABEL: return btrfs_ioctl_set_fslabel(file, argp); case FITRIM: return btrfs_ioctl_fitrim(fs_info, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SUBVOL_CREATE_V2: return btrfs_ioctl_snap_create_v2(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp, false); case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: return btrfs_ioctl_subvol_getflags(inode, argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: return btrfs_ioctl_defrag(file, NULL); case BTRFS_IOC_DEFRAG_RANGE: return btrfs_ioctl_defrag(file, argp); case BTRFS_IOC_RESIZE: return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: return btrfs_ioctl_add_dev(fs_info, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_RM_DEV_V2: return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_TREE_SEARCH: return btrfs_ioctl_tree_search(inode, argp); case BTRFS_IOC_TREE_SEARCH_V2: return btrfs_ioctl_tree_search_v2(inode, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); case BTRFS_IOC_LOGICAL_INO_V2: return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { int ret; ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); /* * There may be work for the cleaner kthread to do (subvolume * deletion, delayed iputs, defrag inodes, etc), so wake it up. */ wake_up_process(fs_info->cleaner_kthread); return ret; } case BTRFS_IOC_START_SYNC: return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: return btrfs_ioctl_wait_sync(fs_info, argp); case BTRFS_IOC_SCRUB: return btrfs_ioctl_scrub(file, argp); case BTRFS_IOC_SCRUB_CANCEL: return btrfs_ioctl_scrub_cancel(fs_info); case BTRFS_IOC_SCRUB_PROGRESS: return btrfs_ioctl_scrub_progress(fs_info, argp); case BTRFS_IOC_BALANCE_V2: return btrfs_ioctl_balance(file, argp); case BTRFS_IOC_BALANCE_CTL: return btrfs_ioctl_balance_ctl(fs_info, arg); case BTRFS_IOC_BALANCE_PROGRESS: return btrfs_ioctl_balance_progress(fs_info, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); #ifdef CONFIG_64BIT case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); case BTRFS_IOC_QUOTA_CTL: return btrfs_ioctl_quota_ctl(file, argp); case BTRFS_IOC_QGROUP_ASSIGN: return btrfs_ioctl_qgroup_assign(file, argp); case BTRFS_IOC_QGROUP_CREATE: return btrfs_ioctl_qgroup_create(file, argp); case BTRFS_IOC_QGROUP_LIMIT: return btrfs_ioctl_qgroup_limit(file, argp); case BTRFS_IOC_QUOTA_RESCAN: return btrfs_ioctl_quota_rescan(file, argp); case BTRFS_IOC_QUOTA_RESCAN_STATUS: return btrfs_ioctl_quota_rescan_status(fs_info, argp); case BTRFS_IOC_QUOTA_RESCAN_WAIT: return btrfs_ioctl_quota_rescan_wait(fs_info); case BTRFS_IOC_DEV_REPLACE: return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(argp); case BTRFS_IOC_GET_FEATURES: return btrfs_ioctl_get_features(fs_info, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: return btrfs_ioctl_get_subvol_info(inode, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: return btrfs_ioctl_get_subvol_rootref(root, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); case FS_IOC_ENABLE_VERITY: return fsverity_ioctl_enable(file, (const void __user *)argp); case FS_IOC_MEASURE_VERITY: return fsverity_ioctl_measure(file, argp); case FS_IOC_READ_VERITY_METADATA: return fsverity_ioctl_read_metadata(file, argp); case BTRFS_IOC_ENCODED_READ: return btrfs_ioctl_encoded_read(file, argp, false); case BTRFS_IOC_ENCODED_WRITE: return btrfs_ioctl_encoded_write(file, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_ENCODED_READ_32: return btrfs_ioctl_encoded_read(file, argp, true); case BTRFS_IOC_ENCODED_WRITE_32: return btrfs_ioctl_encoded_write(file, argp, true); #endif case BTRFS_IOC_SUBVOL_SYNC_WAIT: return btrfs_ioctl_subvol_sync(fs_info, argp); } return -ENOTTY; } #ifdef CONFIG_COMPAT long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* * These all access 32-bit values anyway so no further * handling is necessary. */ switch (cmd) { case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; } return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif
3453 3658 3169 3172 1 2 1 2 2 1 1 7 1 3 2 1 3178 3175 3180 3184 4 3172 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * * Copyright (C) 1991, 1992 Linus Torvalds * * table of configured filesystems */ #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> /* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or * 2) we hold the reference to the module. * The latter can be guaranteed by call of try_module_get(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { __module_get(fs->owner); return fs; } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len]) break; return p; } /** * register_filesystem - register a new filesystem * @fs: the file system structure * * Adds the file system passed to the list of file systems the kernel * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ int register_filesystem(struct file_system_type * fs) { int res = 0; struct file_system_type ** p; if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res; } EXPORT_SYMBOL(register_filesystem); /** * unregister_filesystem - unregister a file system * @fs: filesystem to unregister * * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ int unregister_filesystem(struct file_system_type * fs) { struct file_system_type ** tmp; write_lock(&file_systems_lock); tmp = &file_systems; while (*tmp) { if (fs == *tmp) { *tmp = fs->next; fs->next = NULL; write_unlock(&file_systems_lock); synchronize_rcu(); return 0; } tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); return -EINVAL; } EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; struct filename *name; int err, index; name = getname(__name); err = PTR_ERR(name); if (IS_ERR(name)) return err; err = -EINVAL; read_lock(&file_systems_lock); for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { if (strcmp(tmp->name, name->name) == 0) { err = index; break; } } read_unlock(&file_systems_lock); putname(name); return err; } static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; int len, res; read_lock(&file_systems_lock); for (tmp = file_systems; tmp; tmp = tmp->next, index--) if (index <= 0 && try_module_get(tmp->owner)) break; read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; put_filesystem(tmp); return res; } static int fs_maxindex(void) { struct file_system_type * tmp; int index; read_lock(&file_systems_lock); for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) ; read_unlock(&file_systems_lock); return index; } /* * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; switch (option) { case 1: retval = fs_index((const char __user *) arg1); break; case 2: retval = fs_name(arg1, (char __user *) arg2); break; case 3: retval = fs_maxindex(); break; } return retval; } #endif int __init list_bdev_fs_names(char *buf, size_t size) { struct file_system_type *p; size_t len; int count = 0; read_lock(&file_systems_lock); for (p = file_systems; p; p = p->next) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; if (len > size) { pr_warn("%s: truncating file system list\n", __func__); break; } memcpy(buf, p->name, len); buf += len; size -= len; count++; } read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { struct file_system_type * tmp; read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); return 0; } static int __init proc_filesystems_init(void) { proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); #endif static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); return fs; } struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); if (!fs) pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); fs = NULL; } return fs; } EXPORT_SYMBOL(get_fs_type);
50 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC security handling * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/crypto.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include "ar-internal.h" static const struct rxrpc_security *rxrpc_security_types[] = { [RXRPC_SECURITY_NONE] = &rxrpc_no_security, #ifdef CONFIG_RXKAD [RXRPC_SECURITY_RXKAD] = &rxkad, #endif }; int __init rxrpc_init_security(void) { int i, ret; for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) { if (rxrpc_security_types[i]) { ret = rxrpc_security_types[i]->init(); if (ret < 0) goto failed; } } return 0; failed: for (i--; i >= 0; i--) if (rxrpc_security_types[i]) rxrpc_security_types[i]->exit(); return ret; } void rxrpc_exit_security(void) { int i; for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) if (rxrpc_security_types[i]) rxrpc_security_types[i]->exit(); } /* * look up an rxrpc security module */ const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) { if (security_index >= ARRAY_SIZE(rxrpc_security_types)) return NULL; return rxrpc_security_types[security_index]; } /* * Initialise the security on a client call. */ int rxrpc_init_client_call_security(struct rxrpc_call *call) { const struct rxrpc_security *sec = &rxrpc_no_security; struct rxrpc_key_token *token; struct key *key = call->key; int ret; if (!key) goto found; ret = key_validate(key); if (ret < 0) return ret; for (token = key->payload.data[0]; token; token = token->next) { sec = rxrpc_security_lookup(token->security_index); if (sec) goto found; } return -EKEYREJECTED; found: call->security = sec; call->security_ix = sec->security_index; return 0; } /* * initialise the security on a client connection */ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) { struct rxrpc_key_token *token; struct key *key = conn->key; int ret = 0; _enter("{%d},{%x}", conn->debug_id, key_serial(key)); for (token = key->payload.data[0]; token; token = token->next) { if (token->security_index == conn->security->security_index) goto found; } return -EKEYREJECTED; found: mutex_lock(&conn->security_lock); if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = conn->security->init_connection_security(conn, token); if (ret == 0) { spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) conn->state = RXRPC_CONN_CLIENT; spin_unlock_irq(&conn->state_lock); } } mutex_unlock(&conn->security_lock); return ret; } /* * Set the ops a server connection. */ const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *rx, struct sk_buff *skb) { const struct rxrpc_security *sec; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _enter(""); sec = rxrpc_security_lookup(sp->hdr.securityIndex); if (!sec) { rxrpc_direct_abort(skb, rxrpc_abort_unsupported_security, RX_INVALID_OPERATION, -EKEYREJECTED); return NULL; } if (sp->hdr.securityIndex != RXRPC_SECURITY_NONE && !rx->securities) { rxrpc_direct_abort(skb, rxrpc_abort_no_service_key, sec->no_key_abort, -EKEYREJECTED); return NULL; } return sec; } /* * Find the security key for a server connection. */ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, struct sk_buff *skb, u32 kvno, u32 enctype) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_sock *rx; struct key *key = ERR_PTR(-EKEYREJECTED); key_ref_t kref = NULL; char kdesc[5 + 1 + 3 + 1 + 12 + 1 + 12 + 1]; int ret; _enter(""); if (enctype) sprintf(kdesc, "%u:%u:%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex, kvno, enctype); else if (kvno) sprintf(kdesc, "%u:%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex, kvno); else sprintf(kdesc, "%u:%u", sp->hdr.serviceId, sp->hdr.securityIndex); read_lock(&conn->local->services_lock); rx = conn->local->service; if (!rx) goto out; /* look through the service's keyring */ kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc, true); if (IS_ERR(kref)) { key = ERR_CAST(kref); goto out; } key = key_ref_to_ptr(kref); ret = key_validate(key); if (ret < 0) { key_put(key); key = ERR_PTR(ret); goto out; } out: read_unlock(&conn->local->services_lock); return key; }
38 37 10 10 3 3 3 3 38 37 37 35 38 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 // SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> #include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <linux/mmdebug.h> #include <linux/pagewalk.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" /** * struct vmemmap_remap_walk - walk vmemmap page table * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. * @reuse_page: the page which is reused for the tail vmemmap pages. * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. * @flags: used to modify behavior in vmemmap page table walking * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); unsigned long nr_walked; struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) /* synchronize_rcu() to avoid writes from page_ref_add_unless() */ #define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; }; static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, struct vmemmap_remap_walk *walk) { pmd_t __pmd; int i; unsigned long addr = start; pte_t *pgtable; pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; pmd_populate_kernel(&init_mm, &__pmd, pgtable); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to * be treated as indepdenent small pages (as they can be freed * individually). */ if (!PageReserved(head)) split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } spin_unlock(&init_mm.page_table_lock); return 0; } static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { int ret = 0; struct page *head; struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* Only splitting, not remapping the vmemmap pages. */ if (!vmemmap_walk->remap_pte) walk->action = ACTION_CONTINUE; spin_lock(&init_mm.page_table_lock); head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; /* * Due to HugeTLB alignment requirements and the vmemmap * pages being at the start of the hotplugged memory * region in memory_hotplug.memmap_on_memory case. Checking * the vmemmap page associated with the first vmemmap page * if it is self-hosted is sufficient. * * [ hotplugged memory ] * [ section ][...][ section ] * [ vmemmap ][ usable memory ] * ^ | ^ | * +--+ | | * +------------------------+ */ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ? head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); if (PageVmemmapSelfHosted(page)) ret = -ENOTSUPP; } spin_unlock(&init_mm.page_table_lock); if (!head || ret) return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* * The reuse_page is found 'first' in page table walking before * starting remapping. */ if (!vmemmap_walk->reuse_page) vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); else vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; } static const struct mm_walk_ops vmemmap_remap_ops = { .pmd_entry = vmemmap_pmd_entry, .pte_entry = vmemmap_pte_entry, }; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { int ret; VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; } /* * Free a vmemmap page. A vmemmap page can be allocated from the memblock * allocator or buddy allocator. If the PG_reserved flag is set, it means * that it allocated from the memblock allocator, just free it via the * free_bootmem_page(). Otherwise, use __free_page(). */ static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { memmap_boot_pages_add(-1); free_bootmem_page(page); } else { memmap_pages_add(-1); __free_page(page); } } /* Free a list of the vmemmap pages */ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { /* * Remap the tail pages as read-only to catch illegal write operation * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ if (unlikely(addr == walk->reuse_addr)) { pgprot = PAGE_KERNEL; list_del(&walk->reuse_page->lru); /* * Makes sure that preceding stores to the page contents from * vmemmap_remap_free() become visible before the set_pte_at() * write. */ smp_wmb(); } entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } /* * How many struct page structs need to be reset. When we reuse the head * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message * of "corrupted mapping in tail page". We need to reset at least 3 (one * head struct page struct and two tail struct page structs) struct page * structs. */ #define NR_RESET_STRUCT_PAGE 3 static inline void reset_struct_pages(struct page *start) { struct page *from = start + NR_RESET_STRUCT_PAGE; BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { pgprot_t pgprot = PAGE_KERNEL; struct page *page; void *to; BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to); /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } /** * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) * backing PMDs of the directmap into PTEs * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end, unsigned long reuse) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); return vmemmap_remap_range(reuse, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap * which the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = flags, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* * Allocate a new head vmemmap page to avoid breaking a contiguous * block of struct page memory when freeing it back to page allocator * in free_vmemmap_page_list(). This will allow the likely contiguous * struct page backing memory to be kept contiguous and allowing for * more allocations of hugepages. Fallback to the currently * mapped head page in case should it fail to allocate. */ walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); if (walk.reuse_page) { copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); list_add(&walk.reuse_page->lru, vmemmap_pages); memmap_pages_add(1); } /* * In order to make remapping routine most efficient for the huge pages, * the routine of vmemmap page table walking has the following rules * (see more details from the vmemmap_pte_range()): * * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) * should be continuous. * - The @reuse address is part of the range [@reuse, @end) that we are * walking which is passed to vmemmap_remap_range(). * - The @reuse address is the first in the complete range. * * So we need to make sure that @start and @reuse meet the above rules. */ BUG_ON(start - reuse != PAGE_SIZE); ret = vmemmap_remap_range(reuse, end, &walk); if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* * vmemmap_pages contains pages from the previous * vmemmap_remap_range call which failed. These * are pages which were removed from the vmemmap. * They will be restored in the following call. */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); } return ret; } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { gfp