Total coverage: 111769 (7%)of 1852157
3585 222 5431 5910 8 12 68 3521 6560 6585 203 205 8 8 211 211 211 5 5 311 311 3885 3895 1 3519 3521 3527 115 115 294 296 3963 3930 31 3780 3782 301 301 2305 2307 173 175 179 179 4217 4225 19 19 2305 2309 2308 34 34 16 16 5910 5910 5926 2426 2436 2429 18 18 1082 8 1077 1088 1085 8 1077 1079 41 41 1300 1302 50 50 479 479 146 147 221 221 241 245 122 122 584 587 587 269 271 56 1 56 153 1 152 8 8 1244 1246 126 126 138 139 301 303 238 241 556 557 142 144 125 125 663 664 600 67 664 29 29 262 264 8357 8383 865 2 864 735 2 733 307 1 307 126 126 125 126 93 93 79 79 114 114 198 199 38 38 79 79 79 79 79 79 1598 1602 102 102 39 39 26 26 1809 1810 4455 4486 5432 5431 5449 2258 2270 4341 4355 4333 3532 3544 66 155 220 220 220 222 59 59 195 202 26 26 43 44 4797 4809 4538 4547 260 260 8 8 8 13 13 222 222 222 76 76 4728 4739 144 145 151 151 6184 6182 6162 4 4 4 4 189 189 189 189 4 4 4 4 1959 1967 5 5 5 5 5 5 12 12 12 22 22 35 35 69 68 69 14 14 64 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 // SPDX-License-Identifier: GPL-2.0-or-later /* * Security plug functions * * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com> * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2016 Mellanox Technologies * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com> */ #define pr_fmt(fmt) "LSM: " fmt #include <linux/bpf.h> #include <linux/capability.h> #include <linux/dcache.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/lsm_hooks.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/personality.h> #include <linux/backing-dev.h> #include <linux/string.h> #include <linux/xattr.h> #include <linux/msg.h> #include <linux/overflow.h> #include <linux/perf_event.h> #include <linux/fs.h> #include <net/flow.h> #include <net/sock.h> #define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX /* * Identifier for the LSM static calls. * HOOK is an LSM hook as defined in linux/lsm_hookdefs.h * IDX is the index of the static call. 0 <= NUM < MAX_LSM_COUNT */ #define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX /* * Call the macro M for each LSM hook MAX_LSM_COUNT times. */ #define LSM_LOOP_UNROLL(M, ...) \ do { \ UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) \ } while (0) #define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__) /* * These are descriptions of the reasons that can be passed to the * security_locked_down() LSM hook. Placing this array here allows * all security modules to use the same descriptions for auditing * purposes. */ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_EFI_TEST] = "/dev/efi_test access", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); static struct kmem_cache *lsm_file_cache; static struct kmem_cache *lsm_inode_cache; char *lsm_names; static struct lsm_blob_sizes blob_sizes __ro_after_init; /* Boot-time LSM user choice */ static __initdata const char *chosen_lsm_order; static __initdata const char *chosen_major_lsm; static __initconst const char *const builtin_lsm_order = CONFIG_LSM; /* Ordered list of LSMs to initialize. */ static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1]; static __initdata struct lsm_info *exclusive; #ifdef CONFIG_HAVE_STATIC_CALL #define LSM_HOOK_TRAMP(NAME, NUM) \ &STATIC_CALL_TRAMP(LSM_STATIC_CALL(NAME, NUM)) #else #define LSM_HOOK_TRAMP(NAME, NUM) NULL #endif /* * Define static calls and static keys for each LSM hook. */ #define DEFINE_LSM_STATIC_CALL(NUM, NAME, RET, ...) \ DEFINE_STATIC_CALL_NULL(LSM_STATIC_CALL(NAME, NUM), \ *((RET(*)(__VA_ARGS__))NULL)); \ DEFINE_STATIC_KEY_FALSE(SECURITY_HOOK_ACTIVE_KEY(NAME, NUM)); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ LSM_DEFINE_UNROLL(DEFINE_LSM_STATIC_CALL, NAME, RET, __VA_ARGS__) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK #undef DEFINE_LSM_STATIC_CALL /* * Initialise a table of static calls for each LSM hook. * DEFINE_STATIC_CALL_NULL invocation above generates a key (STATIC_CALL_KEY) * and a trampoline (STATIC_CALL_TRAMP) which are used to call * __static_call_update when updating the static call. * * The static calls table is used by early LSMs, some architectures can fault on * unaligned accesses and the fault handling code may not be ready by then. * Thus, the static calls table should be aligned to avoid any unhandled faults * in early init. */ struct lsm_static_calls_table static_calls_table __ro_after_init __aligned(sizeof(u64)) = { #define INIT_LSM_STATIC_CALL(NUM, NAME) \ (struct lsm_static_call) { \ .key = &STATIC_CALL_KEY(LSM_STATIC_CALL(NAME, NUM)), \ .trampoline = LSM_HOOK_TRAMP(NAME, NUM), \ .active = &SECURITY_HOOK_ACTIVE_KEY(NAME, NUM), \ }, #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ .NAME = { \ LSM_DEFINE_UNROLL(INIT_LSM_STATIC_CALL, NAME) \ }, #include <linux/lsm_hook_defs.h> #undef LSM_HOOK #undef INIT_LSM_STATIC_CALL }; static __initdata bool debug; #define init_debug(...) \ do { \ if (debug) \ pr_info(__VA_ARGS__); \ } while (0) static bool __init is_enabled(struct lsm_info *lsm) { if (!lsm->enabled) return false; return *lsm->enabled; } /* Mark an LSM's enabled flag. */ static int lsm_enabled_true __initdata = 1; static int lsm_enabled_false __initdata = 0; static void __init set_enabled(struct lsm_info *lsm, bool enabled) { /* * When an LSM hasn't configured an enable variable, we can use * a hard-coded location for storing the default enabled state. */ if (!lsm->enabled) { if (enabled) lsm->enabled = &lsm_enabled_true; else lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_true) { if (!enabled) lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_false) { if (enabled) lsm->enabled = &lsm_enabled_true; } else { *lsm->enabled = enabled; } } /* Is an LSM already listed in the ordered LSMs list? */ static bool __init exists_ordered_lsm(struct lsm_info *lsm) { struct lsm_info **check; for (check = ordered_lsms; *check; check++) if (*check == lsm) return true; return false; } /* Append an LSM to the list of ordered LSMs to initialize. */ static int last_lsm __initdata; static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) { /* Ignore duplicate selections. */ if (exists_ordered_lsm(lsm)) return; if (WARN(last_lsm == MAX_LSM_COUNT, "%s: out of LSM static calls!?\n", from)) return; /* Enable this LSM, if it is not already set. */ if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; ordered_lsms[last_lsm++] = lsm; init_debug("%s ordered: %s (%s)\n", from, lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); } /* Is an LSM allowed to be initialized? */ static bool __init lsm_allowed(struct lsm_info *lsm) { /* Skip if the LSM is disabled. */ if (!is_enabled(lsm)) return false; /* Not allowed if another exclusive LSM already initialized. */ if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { init_debug("exclusive disabled: %s\n", lsm->name); return false; } return true; } static void __init lsm_set_blob_size(int *need, int *lbs) { int offset; if (*need <= 0) return; offset = ALIGN(*lbs, sizeof(void *)); *lbs = offset + *need; *need = offset; } static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) { if (!needed) return; lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); lsm_set_blob_size(&needed->lbs_ib, &blob_sizes.lbs_ib); /* * The inode blob gets an rcu_head in addition to * what the modules might need. */ if (needed->lbs_inode && blob_sizes.lbs_inode == 0) blob_sizes.lbs_inode = sizeof(struct rcu_head); lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode); lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc); lsm_set_blob_size(&needed->lbs_key, &blob_sizes.lbs_key); lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg); lsm_set_blob_size(&needed->lbs_perf_event, &blob_sizes.lbs_perf_event); lsm_set_blob_size(&needed->lbs_sock, &blob_sizes.lbs_sock); lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock); lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task); lsm_set_blob_size(&needed->lbs_tun_dev, &blob_sizes.lbs_tun_dev); lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); } /* Prepare LSM for initialization. */ static void __init prepare_lsm(struct lsm_info *lsm) { int enabled = lsm_allowed(lsm); /* Record enablement (to handle any following exclusive LSMs). */ set_enabled(lsm, enabled); /* If enabled, do pre-initialization work. */ if (enabled) { if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { exclusive = lsm; init_debug("exclusive chosen: %s\n", lsm->name); } lsm_set_blob_sizes(lsm->blobs); } } /* Initialize a given LSM, if it is enabled. */ static void __init initialize_lsm(struct lsm_info *lsm) { if (is_enabled(lsm)) { int ret; init_debug("initializing %s\n", lsm->name); ret = lsm->init(); WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); } } /* * Current index to use while initializing the lsm id list. */ u32 lsm_active_cnt __ro_after_init; const struct lsm_id *lsm_idlist[MAX_LSM_COUNT]; /* Populate ordered LSMs list from comma-separated LSM name list. */ static void __init ordered_lsm_parse(const char *order, const char *origin) { struct lsm_info *lsm; char *sep, *name, *next; /* LSM_ORDER_FIRST is always first. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_FIRST) append_ordered_lsm(lsm, " first"); } /* Process "security=", if given. */ if (chosen_major_lsm) { struct lsm_info *major; /* * To match the original "security=" behavior, this * explicitly does NOT fallback to another Legacy Major * if the selected one was separately disabled: disable * all non-matching Legacy Major LSMs. */ for (major = __start_lsm_info; major < __end_lsm_info; major++) { if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && strcmp(major->name, chosen_major_lsm) != 0) { set_enabled(major, false); init_debug("security=%s disabled: %s (only one legacy major LSM)\n", chosen_major_lsm, major->name); } } } sep = kstrdup(order, GFP_KERNEL); next = sep; /* Walk the list, looking for matching LSMs. */ while ((name = strsep(&next, ",")) != NULL) { bool found = false; for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (strcmp(lsm->name, name) == 0) { if (lsm->order == LSM_ORDER_MUTABLE) append_ordered_lsm(lsm, origin); found = true; } } if (!found) init_debug("%s ignored: %s (not built into kernel)\n", origin, name); } /* Process "security=", if given. */ if (chosen_major_lsm) { for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; if (strcmp(lsm->name, chosen_major_lsm) == 0) append_ordered_lsm(lsm, "security="); } } /* LSM_ORDER_LAST is always last. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_LAST) append_ordered_lsm(lsm, " last"); } /* Disable all LSMs not in the ordered list. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; set_enabled(lsm, false); init_debug("%s skipped: %s (not in requested order)\n", origin, lsm->name); } kfree(sep); } static void __init lsm_static_call_init(struct security_hook_list *hl) { struct lsm_static_call *scall = hl->scalls; int i; for (i = 0; i < MAX_LSM_COUNT; i++) { /* Update the first static call that is not used yet */ if (!scall->hl) { __static_call_update(scall->key, scall->trampoline, hl->hook.lsm_func_addr); scall->hl = hl; static_branch_enable(scall->active); return; } scall++; } panic("%s - Ran out of static slots.\n", __func__); } static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); static int lsm_append(const char *new, char **result); static void __init report_lsm_order(void) { struct lsm_info **lsm, *early; int first = 0; pr_info("initializing lsm="); /* Report each enabled LSM name, comma separated. */ for (early = __start_early_lsm_info; early < __end_early_lsm_info; early++) if (is_enabled(early)) pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); for (lsm = ordered_lsms; *lsm; lsm++) if (is_enabled(*lsm)) pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name); pr_cont("\n"); } static void __init ordered_lsm_init(void) { struct lsm_info **lsm; if (chosen_lsm_order) { if (chosen_major_lsm) { pr_warn("security=%s is ignored because it is superseded by lsm=%s\n", chosen_major_lsm, chosen_lsm_order); chosen_major_lsm = NULL; } ordered_lsm_parse(chosen_lsm_order, "cmdline"); } else ordered_lsm_parse(builtin_lsm_order, "builtin"); for (lsm = ordered_lsms; *lsm; lsm++) prepare_lsm(*lsm); report_lsm_order(); init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); init_debug("file blob size = %d\n", blob_sizes.lbs_file); init_debug("ib blob size = %d\n", blob_sizes.lbs_ib); init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); #ifdef CONFIG_KEYS init_debug("key blob size = %d\n", blob_sizes.lbs_key); #endif /* CONFIG_KEYS */ init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); init_debug("sock blob size = %d\n", blob_sizes.lbs_sock); init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event); init_debug("task blob size = %d\n", blob_sizes.lbs_task); init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev); init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev); /* * Create any kmem_caches needed for blobs */ if (blob_sizes.lbs_file) lsm_file_cache = kmem_cache_create("lsm_file_cache", blob_sizes.lbs_file, 0, SLAB_PANIC, NULL); if (blob_sizes.lbs_inode) lsm_inode_cache = kmem_cache_create("lsm_inode_cache", blob_sizes.lbs_inode, 0, SLAB_PANIC, NULL); lsm_early_cred((struct cred *) current->cred); lsm_early_task(current); for (lsm = ordered_lsms; *lsm; lsm++) initialize_lsm(*lsm); } int __init early_security_init(void) { struct lsm_info *lsm; for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; prepare_lsm(lsm); initialize_lsm(lsm); } return 0; } /** * security_init - initializes the security framework * * This should be called early in the kernel initialization sequence. */ int __init security_init(void) { struct lsm_info *lsm; init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*"); init_debug(" CONFIG_LSM=%s\n", builtin_lsm_order); init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*"); /* * Append the names of the early LSM modules now that kmalloc() is * available */ for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { init_debug(" early started: %s (%s)\n", lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); if (lsm->enabled) lsm_append(lsm->name, &lsm_names); } /* Load LSMs in specified order. */ ordered_lsm_init(); return 0; } /* Save user chosen LSM */ static int __init choose_major_lsm(char *str) { chosen_major_lsm = str; return 1; } __setup("security=", choose_major_lsm); /* Explicitly choose LSM initialization order. */ static int __init choose_lsm_order(char *str) { chosen_lsm_order = str; return 1; } __setup("lsm=", choose_lsm_order); /* Enable LSM order debugging. */ static int __init enable_debug(char *str) { debug = true; return 1; } __setup("lsm.debug", enable_debug); static bool match_last_lsm(const char *list, const char *lsm) { const char *last; if (WARN_ON(!list || !lsm)) return false; last = strrchr(list, ','); if (last) /* Pass the comma, strcmp() will check for '\0' */ last++; else last = list; return !strcmp(last, lsm); } static int lsm_append(const char *new, char **result) { char *cp; if (*result == NULL) { *result = kstrdup(new, GFP_KERNEL); if (*result == NULL) return -ENOMEM; } else { /* Check if it is the last registered name */ if (match_last_lsm(*result, new)) return 0; cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); if (cp == NULL) return -ENOMEM; kfree(*result); *result = cp; } return 0; } /** * security_add_hooks - Add a modules hooks to the hook lists. * @hooks: the hooks to add * @count: the number of hooks to add * @lsmid: the identification information for the security module * * Each LSM has to register its hooks with the infrastructure. */ void __init security_add_hooks(struct security_hook_list *hooks, int count, const struct lsm_id *lsmid) { int i; /* * A security module may call security_add_hooks() more * than once during initialization, and LSM initialization * is serialized. Landlock is one such case. * Look at the previous entry, if there is one, for duplication. */ if (lsm_active_cnt == 0 || lsm_idlist[lsm_active_cnt - 1] != lsmid) { if (lsm_active_cnt >= MAX_LSM_COUNT) panic("%s Too many LSMs registered.\n", __func__); lsm_idlist[lsm_active_cnt++] = lsmid; } for (i = 0; i < count; i++) { hooks[i].lsmid = lsmid; lsm_static_call_init(&hooks[i]); } /* * Don't try to append during early_security_init(), we'll come back * and fix this up afterwards. */ if (slab_is_available()) { if (lsm_append(lsmid->name, &lsm_names) < 0) panic("%s - Cannot get early memory.\n", __func__); } } int call_blocking_lsm_notifier(enum lsm_event event, void *data) { return blocking_notifier_call_chain(&blocking_lsm_notifier_chain, event, data); } EXPORT_SYMBOL(call_blocking_lsm_notifier); int register_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(register_blocking_lsm_notifier); int unregister_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(unregister_blocking_lsm_notifier); /** * lsm_blob_alloc - allocate a composite blob * @dest: the destination for the blob * @size: the size of the blob * @gfp: allocation type * * Allocate a blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp) { if (size == 0) { *dest = NULL; return 0; } *dest = kzalloc(size, gfp); if (*dest == NULL) return -ENOMEM; return 0; } /** * lsm_cred_alloc - allocate a composite cred blob * @cred: the cred that needs a blob * @gfp: allocation type * * Allocate the cred blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp); } /** * lsm_early_cred - during initialization allocate a composite cred blob * @cred: the cred that needs a blob * * Allocate the cred blob for all the modules */ static void __init lsm_early_cred(struct cred *cred) { int rc = lsm_cred_alloc(cred, GFP_KERNEL); if (rc) panic("%s: Early cred alloc failed.\n", __func__); } /** * lsm_file_alloc - allocate a composite file blob * @file: the file that needs a blob * * Allocate the file blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_file_alloc(struct file *file) { if (!lsm_file_cache) { file->f_security = NULL; return 0; } file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); if (file->f_security == NULL) return -ENOMEM; return 0; } /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob * @gfp: allocation flags * * Allocate the inode blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_inode_alloc(struct inode *inode, gfp_t gfp) { if (!lsm_inode_cache) { inode->i_security = NULL; return 0; } inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp); if (inode->i_security == NULL) return -ENOMEM; return 0; } /** * lsm_task_alloc - allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_task_alloc(struct task_struct *task) { return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL); } /** * lsm_ipc_alloc - allocate a composite ipc blob * @kip: the ipc that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) { return lsm_blob_alloc(&kip->security, blob_sizes.lbs_ipc, GFP_KERNEL); } #ifdef CONFIG_KEYS /** * lsm_key_alloc - allocate a composite key blob * @key: the key that needs a blob * * Allocate the key blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_key_alloc(struct key *key) { return lsm_blob_alloc(&key->security, blob_sizes.lbs_key, GFP_KERNEL); } #endif /* CONFIG_KEYS */ /** * lsm_msg_msg_alloc - allocate a composite msg_msg blob * @mp: the msg_msg that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_msg_msg_alloc(struct msg_msg *mp) { return lsm_blob_alloc(&mp->security, blob_sizes.lbs_msg_msg, GFP_KERNEL); } /** * lsm_bdev_alloc - allocate a composite block_device blob * @bdev: the block_device that needs a blob * * Allocate the block_device blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_bdev_alloc(struct block_device *bdev) { if (blob_sizes.lbs_bdev == 0) { bdev->bd_security = NULL; return 0; } bdev->bd_security = kzalloc(blob_sizes.lbs_bdev, GFP_KERNEL); if (!bdev->bd_security) return -ENOMEM; return 0; } /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules */ static void __init lsm_early_task(struct task_struct *task) { int rc = lsm_task_alloc(task); if (rc) panic("%s: Early task alloc failed.\n", __func__); } /** * lsm_superblock_alloc - allocate a composite superblock blob * @sb: the superblock that needs a blob * * Allocate the superblock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_superblock_alloc(struct super_block *sb) { return lsm_blob_alloc(&sb->s_security, blob_sizes.lbs_superblock, GFP_KERNEL); } /** * lsm_fill_user_ctx - Fill a user space lsm_ctx structure * @uctx: a userspace LSM context to be filled * @uctx_len: available uctx size (input), used uctx size (output) * @val: the new LSM context value * @val_len: the size of the new LSM context value * @id: LSM id * @flags: LSM defined flags * * Fill all of the fields in a userspace lsm_ctx structure. If @uctx is NULL * simply calculate the required size to output via @utc_len and return * success. * * Returns 0 on success, -E2BIG if userspace buffer is not large enough, * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated. */ int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags) { struct lsm_ctx *nctx = NULL; size_t nctx_len; int rc = 0; nctx_len = ALIGN(struct_size(nctx, ctx, val_len), sizeof(void *)); if (nctx_len > *uctx_len) { rc = -E2BIG; goto out; } /* no buffer - return success/0 and set @uctx_len to the req size */ if (!uctx) goto out; nctx = kzalloc(nctx_len, GFP_KERNEL); if (nctx == NULL) { rc = -ENOMEM; goto out; } nctx->id = id; nctx->flags = flags; nctx->len = nctx_len; nctx->ctx_len = val_len; memcpy(nctx->ctx, val, val_len); if (copy_to_user(uctx, nctx, nctx_len)) rc = -EFAULT; out: kfree(nctx); *uctx_len = nctx_len; return rc; } /* * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and * can be accessed with: * * LSM_RET_DEFAULT(<hook_name>) * * The macros below define static constants for the default value of each * LSM hook. */ #define LSM_RET_DEFAULT(NAME) (NAME##_default) #define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME) #define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \ static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK /* * Hook list operation macros. * * call_void_hook: * This is a hook that does not return a value. * * call_int_hook: * This is a hook that returns a value. */ #define __CALL_STATIC_VOID(NUM, HOOK, ...) \ do { \ if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) { \ static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__); \ } \ } while (0); #define call_void_hook(HOOK, ...) \ do { \ LSM_LOOP_UNROLL(__CALL_STATIC_VOID, HOOK, __VA_ARGS__); \ } while (0) #define __CALL_STATIC_INT(NUM, R, HOOK, LABEL, ...) \ do { \ if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) { \ R = static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__); \ if (R != LSM_RET_DEFAULT(HOOK)) \ goto LABEL; \ } \ } while (0); #define call_int_hook(HOOK, ...) \ ({ \ __label__ OUT; \ int RC = LSM_RET_DEFAULT(HOOK); \ \ LSM_LOOP_UNROLL(__CALL_STATIC_INT, RC, HOOK, OUT, __VA_ARGS__); \ OUT: \ RC; \ }) #define lsm_for_each_hook(scall, NAME) \ for (scall = static_calls_table.NAME; \ scall - static_calls_table.NAME < MAX_LSM_COUNT; scall++) \ if (static_key_enabled(&scall->active->key)) /* Security operations */ /** * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok * @mgr: task credentials of current binder process * * Check whether @mgr is allowed to be the binder context manager. * * Return: Return 0 if permission is granted. */ int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, mgr); } /** * security_binder_transaction() - Check if a binder transaction is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to invoke a binder transaction call to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transaction(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transaction, from, to); } /** * security_binder_transfer_binder() - Check if a binder transfer is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to transfer a binder reference to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transfer_binder(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transfer_binder, from, to); } /** * security_binder_transfer_file() - Check if a binder file xfer is allowed * @from: sending process * @to: receiving process * @file: file being transferred * * Check whether @from is allowed to transfer @file to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transfer_file(const struct cred *from, const struct cred *to, const struct file *file) { return call_int_hook(binder_transfer_file, from, to, file); } /** * security_ptrace_access_check() - Check if tracing is allowed * @child: target process * @mode: PTRACE_MODE flags * * Check permission before allowing the current process to trace the @child * process. Security modules may also want to perform a process tracing check * during an execve in the set_security or apply_creds hooks of tracing check * during an execve in the bprm_set_creds hook of binprm_security_ops if the * process is being traced and its security attributes would be changed by the * execve. * * Return: Returns 0 if permission is granted. */ int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { return call_int_hook(ptrace_access_check, child, mode); } /** * security_ptrace_traceme() - Check if tracing is allowed * @parent: tracing process * * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself to the * @parent process for tracing. * * Return: Returns 0 if permission is granted. */ int security_ptrace_traceme(struct task_struct *parent) { return call_int_hook(ptrace_traceme, parent); } /** * security_capget() - Get the capability sets for a process * @target: target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Get the @effective, @inheritable, and @permitted capability sets for the * @target process. The hook may also perform permission checking to determine * if the current process is allowed to see the capability sets of the @target * process. * * Return: Returns 0 if the capability sets were successfully obtained. */ int security_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return call_int_hook(capget, target, effective, inheritable, permitted); } /** * security_capset() - Set the capability sets for a process * @new: new credentials for the target process * @old: current credentials of the target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Set the @effective, @inheritable, and @permitted capability sets for the * current process. * * Return: Returns 0 and update @new if permission is granted. */ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return call_int_hook(capset, new, old, effective, inheritable, permitted); } /** * security_capable() - Check if a process has the necessary capability * @cred: credentials to examine * @ns: user namespace * @cap: capability requested * @opts: capability check options * * Check whether the @tsk process has the @cap capability in the indicated * credentials. @cap contains the capability <include/linux/capability.h>. * @opts contains options for the capable check <include/linux/security.h>. * * Return: Returns 0 if the capability is granted. */ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return call_int_hook(capable, cred, ns, cap, opts); } /** * security_quotactl() - Check if a quotactl() syscall is allowed for this fs * @cmds: commands * @type: type * @id: id * @sb: filesystem * * Check whether the quotactl syscall is allowed for this @sb. * * Return: Returns 0 if permission is granted. */ int security_quotactl(int cmds, int type, int id, const struct super_block *sb) { return call_int_hook(quotactl, cmds, type, id, sb); } /** * security_quota_on() - Check if QUOTAON is allowed for a dentry * @dentry: dentry * * Check whether QUOTAON is allowed for @dentry. * * Return: Returns 0 if permission is granted. */ int security_quota_on(struct dentry *dentry) { return call_int_hook(quota_on, dentry); } /** * security_syslog() - Check if accessing the kernel message ring is allowed * @type: SYSLOG_ACTION_* type * * Check permission before accessing the kernel message ring or changing * logging to the console. See the syslog(2) manual page for an explanation of * the @type values. * * Return: Return 0 if permission is granted. */ int security_syslog(int type) { return call_int_hook(syslog, type); } /** * security_settime64() - Check if changing the system time is allowed * @ts: new time * @tz: timezone * * Check permission to change the system time, struct timespec64 is defined in * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>. * * Return: Returns 0 if permission is granted. */ int security_settime64(const struct timespec64 *ts, const struct timezone *tz) { return call_int_hook(settime, ts, tz); } /** * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed * @mm: mm struct * @pages: number of pages * * Check permissions for allocating a new virtual mapping. If all LSMs return * a positive value, __vm_enough_memory() will be called with cap_sys_admin * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be * called with cap_sys_admin cleared. * * Return: Returns 0 if permission is granted by the LSM infrastructure to the * caller. */ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { struct lsm_static_call *scall; int cap_sys_admin = 1; int rc; /* * The module will respond with 0 if it thinks the __vm_enough_memory() * call should be made with the cap_sys_admin set. If all of the modules * agree that it should be set it will. If any module thinks it should * not be set it won't. */ lsm_for_each_hook(scall, vm_enough_memory) { rc = scall->hl->hook.vm_enough_memory(mm, pages); if (rc < 0) { cap_sys_admin = 0; break; } } return __vm_enough_memory(mm, pages, cap_sys_admin); } /** * security_bprm_creds_for_exec() - Prepare the credentials for exec() * @bprm: binary program information * * If the setup in prepare_exec_creds did not setup @bprm->cred->security * properly for executing @bprm->file, update the LSM's portion of * @bprm->cred->security to be what commit_creds needs to install for the new * program. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. @bprm * contains the linux_binprm structure. * * If execveat(2) is called with the AT_EXECVE_CHECK flag, bprm->is_check is * set. The result must be the same as without this flag even if the execution * will never really happen and @bprm will always be dropped. * * This hook must not change current->cred, only @bprm->cred. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_for_exec(struct linux_binprm *bprm) { return call_int_hook(bprm_creds_for_exec, bprm); } /** * security_bprm_creds_from_file() - Update linux_binprm creds based on file * @bprm: binary program information * @file: associated file * * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon * exec, update @bprm->cred to reflect that change. This is called after * finding the binary that will be executed without an interpreter. This * ensures that the credentials will not be derived from a script that the * binary will need to reopen, which when reopend may end up being a completely * different file. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. The * hook must add to @bprm->per_clear any personality flags that should be * cleared from current->personality. @bprm contains the linux_binprm * structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file) { return call_int_hook(bprm_creds_from_file, bprm, file); } /** * security_bprm_check() - Mediate binary handler search * @bprm: binary program information * * This hook mediates the point when a search for a binary handler will begin. * It allows a check against the @bprm->cred->security value which was set in * the preceding creds_for_exec call. The argv list and envp list are reliably * available in @bprm. This hook may be called multiple times during a single * execve. @bprm contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_check(struct linux_binprm *bprm) { return call_int_hook(bprm_check_security, bprm); } /** * security_bprm_committing_creds() - Install creds for a process during exec() * @bprm: binary program information * * Prepare to install the new security attributes of a process being * transformed by an execve operation, based on the old credentials pointed to * by @current->cred and the information set in @bprm->cred by the * bprm_creds_for_exec hook. @bprm points to the linux_binprm structure. This * hook is a good place to perform state changes on the process such as closing * open file descriptors to which access will no longer be granted when the * attributes are changed. This is called immediately before commit_creds(). */ void security_bprm_committing_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committing_creds, bprm); } /** * security_bprm_committed_creds() - Tidy up after cred install during exec() * @bprm: binary program information * * Tidy up after the installation of the new security attributes of a process * being transformed by an execve operation. The new credentials have, by this * point, been set to @current->cred. @bprm points to the linux_binprm * structure. This hook is a good place to perform state changes on the * process such as clearing out non-inheritable signal state. This is called * immediately after commit_creds(). */ void security_bprm_committed_creds(const struct linux_binprm *bprm) { call_void_hook(bprm_committed_creds, bprm); } /** * security_fs_context_submount() - Initialise fc->security * @fc: new filesystem context * @reference: dentry reference for submount/remount * * Fill out the ->security field for a new fs_context. * * Return: Returns 0 on success or negative error code on failure. */ int security_fs_context_submount(struct fs_context *fc, struct super_block *reference) { return call_int_hook(fs_context_submount, fc, reference); } /** * security_fs_context_dup() - Duplicate a fs_context LSM blob * @fc: destination filesystem context * @src_fc: source filesystem context * * Allocate and attach a security structure to sc->security. This pointer is * initialised to NULL by the caller. @fc indicates the new filesystem context. * @src_fc indicates the original filesystem context. * * Return: Returns 0 on success or a negative error code on failure. */ int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { return call_int_hook(fs_context_dup, fc, src_fc); } /** * security_fs_context_parse_param() - Configure a filesystem context * @fc: filesystem context * @param: filesystem parameter * * Userspace provided a parameter to configure a superblock. The LSM can * consume the parameter or return it to the caller for use elsewhere. * * Return: If the parameter is used by the LSM it should return 0, if it is * returned to the caller -ENOPARAM is returned, otherwise a negative * error code is returned. */ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct lsm_static_call *scall; int trc; int rc = -ENOPARAM; lsm_for_each_hook(scall, fs_context_parse_param) { trc = scall->hl->hook.fs_context_parse_param(fc, param); if (trc == 0) rc = 0; else if (trc != -ENOPARAM) return trc; } return rc; } /** * security_sb_alloc() - Allocate a super_block LSM blob * @sb: filesystem superblock * * Allocate and attach a security structure to the sb->s_security field. The * s_security field is initialized to NULL when the structure is allocated. * @sb contains the super_block structure to be modified. * * Return: Returns 0 if operation was successful. */ int security_sb_alloc(struct super_block *sb) { int rc = lsm_superblock_alloc(sb); if (unlikely(rc)) return rc; rc = call_int_hook(sb_alloc_security, sb); if (unlikely(rc)) security_sb_free(sb); return rc; } /** * security_sb_delete() - Release super_block LSM associated objects * @sb: filesystem superblock * * Release objects tied to a superblock (e.g. inodes). @sb contains the * super_block structure being released. */ void security_sb_delete(struct super_block *sb) { call_void_hook(sb_delete, sb); } /** * security_sb_free() - Free a super_block LSM blob * @sb: filesystem superblock * * Deallocate and clear the sb->s_security field. @sb contains the super_block * structure to be modified. */ void security_sb_free(struct super_block *sb) { call_void_hook(sb_free_security, sb); kfree(sb->s_security); sb->s_security = NULL; } /** * security_free_mnt_opts() - Free memory associated with mount options * @mnt_opts: LSM processed mount options * * Free memory associated with @mnt_ops. */ void security_free_mnt_opts(void **mnt_opts) { if (!*mnt_opts) return; call_void_hook(sb_free_mnt_opts, *mnt_opts); *mnt_opts = NULL; } EXPORT_SYMBOL(security_free_mnt_opts); /** * security_sb_eat_lsm_opts() - Consume LSM mount options * @options: mount options * @mnt_opts: LSM processed mount options * * Eat (scan @options) and save them in @mnt_opts. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_eat_lsm_opts(char *options, void **mnt_opts) { return call_int_hook(sb_eat_lsm_opts, options, mnt_opts); } EXPORT_SYMBOL(security_sb_eat_lsm_opts); /** * security_sb_mnt_opts_compat() - Check if new mount options are allowed * @sb: filesystem superblock * @mnt_opts: new mount options * * Determine if the new mount options in @mnt_opts are allowed given the * existing mounted filesystem at @sb. @sb superblock being compared. * * Return: Returns 0 if options are compatible. */ int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_mnt_opts_compat, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_mnt_opts_compat); /** * security_sb_remount() - Verify no incompatible mount changes during remount * @sb: filesystem superblock * @mnt_opts: (re)mount options * * Extracts security system specific mount options and verifies no changes are * being made to those options. * * Return: Returns 0 if permission is granted. */ int security_sb_remount(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_remount, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_remount); /** * security_sb_kern_mount() - Check if a kernel mount is allowed * @sb: filesystem superblock * * Mount this @sb if allowed by permissions. * * Return: Returns 0 if permission is granted. */ int security_sb_kern_mount(const struct super_block *sb) { return call_int_hook(sb_kern_mount, sb); } /** * security_sb_show_options() - Output the mount options for a superblock * @m: output file * @sb: filesystem superblock * * Show (print on @m) mount options for this @sb. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_show_options(struct seq_file *m, struct super_block *sb) { return call_int_hook(sb_show_options, m, sb); } /** * security_sb_statfs() - Check if accessing fs stats is allowed * @dentry: superblock handle * * Check permission before obtaining filesystem statistics for the @mnt * mountpoint. @dentry is a handle on the superblock for the filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_statfs(struct dentry *dentry) { return call_int_hook(sb_statfs, dentry); } /** * security_sb_mount() - Check permission for mounting a filesystem * @dev_name: filesystem backing device * @path: mount point * @type: filesystem type * @flags: mount flags * @data: filesystem specific data * * Check permission before an object specified by @dev_name is mounted on the * mount point named by @nd. For an ordinary mount, @dev_name identifies a * device if the file system type requires a device. For a remount * (@flags & MS_REMOUNT), @dev_name is irrelevant. For a loopback/bind mount * (@flags & MS_BIND), @dev_name identifies the pathname of the object being * mounted. * * Return: Returns 0 if permission is granted. */ int security_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return call_int_hook(sb_mount, dev_name, path, type, flags, data); } /** * security_sb_umount() - Check permission for unmounting a filesystem * @mnt: mounted filesystem * @flags: unmount flags * * Check permission before the @mnt file system is unmounted. * * Return: Returns 0 if permission is granted. */ int security_sb_umount(struct vfsmount *mnt, int flags) { return call_int_hook(sb_umount, mnt, flags); } /** * security_sb_pivotroot() - Check permissions for pivoting the rootfs * @old_path: new location for current rootfs * @new_path: location of the new rootfs * * Check permission before pivoting the root filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return call_int_hook(sb_pivotroot, old_path, new_path); } /** * security_sb_set_mnt_opts() - Set the mount options for a filesystem * @sb: filesystem superblock * @mnt_opts: binary mount options * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Set the security relevant mount options used for a superblock. * * Return: Returns 0 on success, error on failure. */ int security_sb_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { struct lsm_static_call *scall; int rc = mnt_opts ? -EOPNOTSUPP : LSM_RET_DEFAULT(sb_set_mnt_opts); lsm_for_each_hook(scall, sb_set_mnt_opts) { rc = scall->hl->hook.sb_set_mnt_opts(sb, mnt_opts, kern_flags, set_kern_flags); if (rc != LSM_RET_DEFAULT(sb_set_mnt_opts)) break; } return rc; } EXPORT_SYMBOL(security_sb_set_mnt_opts); /** * security_sb_clone_mnt_opts() - Duplicate superblock mount options * @oldsb: source superblock * @newsb: destination superblock * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Copy all security options from a given superblock to another. * * Return: Returns 0 on success, error on failure. */ int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_clone_mnt_opts, oldsb, newsb, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); /** * security_move_mount() - Check permissions for moving a mount * @from_path: source mount point * @to_path: destination mount point * * Check permission before a mount is moved. * * Return: Returns 0 if permission is granted. */ int security_move_mount(const struct path *from_path, const struct path *to_path) { return call_int_hook(move_mount, from_path, to_path); } /** * security_path_notify() - Check if setting a watch is allowed * @path: file path * @mask: event mask * @obj_type: file path type * * Check permissions before setting a watch on events as defined by @mask, on * an object at @path, whose type is defined by @obj_type. * * Return: Returns 0 if permission is granted. */ int security_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { return call_int_hook(path_notify, path, mask, obj_type); } /** * security_inode_alloc() - Allocate an inode LSM blob * @inode: the inode * @gfp: allocation flags * * Allocate and attach a security structure to @inode->i_security. The * i_security field is initialized to NULL when the inode structure is * allocated. * * Return: Return 0 if operation was successful. */ int security_inode_alloc(struct inode *inode, gfp_t gfp) { int rc = lsm_inode_alloc(inode, gfp); if (unlikely(rc)) return rc; rc = call_int_hook(inode_alloc_security, inode); if (unlikely(rc)) security_inode_free(inode); return rc; } static void inode_free_by_rcu(struct rcu_head *head) { /* The rcu head is at the start of the inode blob */ call_void_hook(inode_free_security_rcu, head); kmem_cache_free(lsm_inode_cache, head); } /** * security_inode_free() - Free an inode's LSM blob * @inode: the inode * * Release any LSM resources associated with @inode, although due to the * inode's RCU protections it is possible that the resources will not be * fully released until after the current RCU grace period has elapsed. * * It is important for LSMs to note that despite being present in a call to * security_inode_free(), @inode may still be referenced in a VFS path walk * and calls to security_inode_permission() may be made during, or after, * a call to security_inode_free(). For this reason the inode->i_security * field is released via a call_rcu() callback and any LSMs which need to * retain inode state for use in security_inode_permission() should only * release that state in the inode_free_security_rcu() LSM hook callback. */ void security_inode_free(struct inode *inode) { call_void_hook(inode_free_security, inode); if (!inode->i_security) return; call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu); } /** * security_dentry_init_security() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @xattr_name: name of the security/LSM xattr * @lsmctx: pointer to the resulting LSM context * * Compute a context for a dentry as the inode is not yet available since NFSv4 * has no label backed by an EA anyway. It is important to note that * @xattr_name does not need to be free'd by the caller, it is a static string. * * Return: Returns 0 on success, negative values on failure. */ int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, struct lsm_context *lsmctx) { return call_int_hook(dentry_init_security, dentry, mode, name, xattr_name, lsmctx); } EXPORT_SYMBOL(security_dentry_init_security); /** * security_dentry_create_files_as() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @old: creds to use for LSM context calculations * @new: creds to modify * * Compute a context for a dentry as the inode is not yet available and set * that context in passed in creds so that new files are created using that * context. Context is calculated using the passed in creds and not the creds * of the caller. * * Return: Returns 0 on success, error on failure. */ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, dentry, mode, name, old, new); } EXPORT_SYMBOL(security_dentry_create_files_as); /** * security_inode_init_security() - Initialize an inode's LSM context * @inode: the inode * @dir: parent directory * @qstr: last component of the pathname * @initxattrs: callback function to write xattrs * @fs_data: filesystem specific data * * Obtain the security attribute name suffix and value to set on a newly * created inode and set up the incore security field for the new inode. This * hook is called by the fs code as part of the inode creation transaction and * provides for atomic labeling of the inode, unlike the post_create/mkdir/... * hooks called by the VFS. * * The hook function is expected to populate the xattrs array, by calling * lsm_get_xattr_slot() to retrieve the slots reserved by the security module * with the lbs_xattr_count field of the lsm_blob_sizes structure. For each * slot, the hook function should set ->name to the attribute name suffix * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it * to the attribute value, to set ->value_len to the length of the value. If * the security module does not use security attributes or does not wish to put * a security attribute on this particular inode, then it should return * -EOPNOTSUPP to skip this processing. * * Return: Returns 0 if the LSM successfully initialized all of the inode * security attributes that are required, negative values otherwise. */ int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const initxattrs initxattrs, void *fs_data) { struct lsm_static_call *scall; struct xattr *new_xattrs = NULL; int ret = -EOPNOTSUPP, xattr_count = 0; if (unlikely(IS_PRIVATE(inode))) return 0; if (!blob_sizes.lbs_xattr_count) return 0; if (initxattrs) { /* Allocate +1 as terminator. */ new_xattrs = kcalloc(blob_sizes.lbs_xattr_count + 1, sizeof(*new_xattrs), GFP_NOFS); if (!new_xattrs) return -ENOMEM; } lsm_for_each_hook(scall, inode_init_security) { ret = scall->hl->hook.inode_init_security(inode, dir, qstr, new_xattrs, &xattr_count); if (ret && ret != -EOPNOTSUPP) goto out; /* * As documented in lsm_hooks.h, -EOPNOTSUPP in this context * means that the LSM is not willing to provide an xattr, not * that it wants to signal an error. Thus, continue to invoke * the remaining LSMs. */ } /* If initxattrs() is NULL, xattr_count is zero, skip the call. */ if (!xattr_count) goto out; ret = initxattrs(inode, new_xattrs, fs_data); out: for (; xattr_count > 0; xattr_count--) kfree(new_xattrs[xattr_count - 1].value); kfree(new_xattrs); return (ret == -EOPNOTSUPP) ? 0 : ret; } EXPORT_SYMBOL(security_inode_init_security); /** * security_inode_init_security_anon() - Initialize an anonymous inode * @inode: the inode * @name: the anonymous inode class * @context_inode: an optional related inode * * Set up the incore security field for the new anonymous inode and return * whether the inode creation is permitted by the security module or not. * * Return: Returns 0 on success, -EACCES if the security module denies the * creation of this inode, or another -errno upon other errors. */ int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) { return call_int_hook(inode_init_security_anon, inode, name, context_inode); } #ifdef CONFIG_SECURITY_PATH /** * security_path_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a file. Note that this hook is called even * if mknod operation is being done for a regular file. * * Return: Returns 0 if permission is granted. */ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mknod, dir, dentry, mode, dev); } EXPORT_SYMBOL(security_path_mknod); /** * security_path_post_mknod() - Update inode security after reg file creation * @idmap: idmap of the mount * @dentry: new file * * Update inode security field after a regular file has been created. */ void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(path_post_mknod, idmap, dentry); } /** * security_path_mkdir() - Check if creating a new directory is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory. * * Return: Returns 0 if permission is granted. */ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mkdir, dir, dentry, mode); } EXPORT_SYMBOL(security_path_mkdir); /** * security_path_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to remove * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_path_rmdir(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_rmdir, dir, dentry); } /** * security_path_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_unlink(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_unlink, dir, dentry); } EXPORT_SYMBOL(security_path_unlink); /** * security_path_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: file pathname * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_symlink, dir, dentry, old_name); } /** * security_path_link - Check if creating a hard link is allowed * @old_dentry: existing file * @new_dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(path_link, old_dentry, new_dir, new_dentry); } /** * security_path_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; return call_int_hook(path_rename, old_dir, old_dentry, new_dir, new_dentry, flags); } EXPORT_SYMBOL(security_path_rename); /** * security_path_truncate() - Check if truncating a file is allowed * @path: file * * Check permission before truncating the file indicated by path. Note that * truncation permissions may also be checked based on already opened files, * using the security_file_truncate() hook. * * Return: Returns 0 if permission is granted. */ int security_path_truncate(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_truncate, path); } /** * security_path_chmod() - Check if changing the file's mode is allowed * @path: file * @mode: new mode * * Check for permission to change a mode of the file @path. The new mode is * specified in @mode which is a bitmask of constants from * <include/uapi/linux/stat.h>. * * Return: Returns 0 if permission is granted. */ int security_path_chmod(const struct path *path, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chmod, path, mode); } /** * security_path_chown() - Check if changing the file's owner/group is allowed * @path: file * @uid: file owner * @gid: file group * * Check for permission to change owner/group of a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chown, path, uid, gid); } /** * security_path_chroot() - Check if changing the root directory is allowed * @path: directory * * Check for permission to change root directory. * * Return: Returns 0 if permission is granted. */ int security_path_chroot(const struct path *path) { return call_int_hook(path_chroot, path); } #endif /* CONFIG_SECURITY_PATH */ /** * security_inode_create() - Check if creating a file is allowed * @dir: the parent directory * @dentry: the file being created * @mode: requested file mode * * Check permission to create a regular file. * * Return: Returns 0 if permission is granted. */ int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_create, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_create); /** * security_inode_post_create_tmpfile() - Update inode security of new tmpfile * @idmap: idmap of the mount * @inode: inode of the new tmpfile * * Update inode security data after a tmpfile has been created. */ void security_inode_post_create_tmpfile(struct mnt_idmap *idmap, struct inode *inode) { if (unlikely(IS_PRIVATE(inode))) return; call_void_hook(inode_post_create_tmpfile, idmap, inode); } /** * security_inode_link() - Check if creating a hard link is allowed * @old_dentry: existing file * @dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(inode_link, old_dentry, dir, new_dentry); } /** * security_inode_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_unlink(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_unlink, dir, dentry); } /** * security_inode_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: existing filename * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_symlink(struct inode *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_symlink, dir, dentry, old_name); } /** * security_inode_mkdir() - Check if creation a new director is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory * associated with inode structure @dir. * * Return: Returns 0 if permission is granted. */ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mkdir, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_mkdir); /** * security_inode_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to be removed * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_rmdir, dir, dentry); } /** * security_inode_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a special file (or a socket or a fifo file * created via the mknod system call). Note that if mknod operation is being * done for a regular file, then the create hook will be called and not this * hook. * * Return: Returns 0 if permission is granted. */ int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mknod, dir, dentry, mode, dev); } /** * security_inode_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(inode_rename, new_dir, new_dentry, old_dir, old_dentry); if (err) return err; } return call_int_hook(inode_rename, old_dir, old_dentry, new_dir, new_dentry); } /** * security_inode_readlink() - Check if reading a symbolic link is allowed * @dentry: link * * Check the permission to read the symbolic link. * * Return: Returns 0 if permission is granted. */ int security_inode_readlink(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_readlink, dentry); } /** * security_inode_follow_link() - Check if following a symbolic link is allowed * @dentry: link dentry * @inode: link inode * @rcu: true if in RCU-walk mode * * Check permission to follow a symbolic link when looking up a pathname. If * @rcu is true, @inode is not stable. * * Return: Returns 0 if permission is granted. */ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_follow_link, dentry, inode, rcu); } /** * security_inode_permission() - Check if accessing an inode is allowed * @inode: inode * @mask: access mask * * Check permission before accessing an inode. This hook is called by the * existing Linux permission function, so a security module can use it to * provide additional checking for existing Linux permission checks. Notice * that this hook is called when a file is opened (as well as many other * operations), whereas the file_security_ops permission hook is called when * the actual read/write operations are performed. * * Return: Returns 0 if permission is granted. */ int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_permission, inode, mask); } /** * security_inode_setattr() - Check if setting file attributes is allowed * @idmap: idmap of the mount * @dentry: file * @attr: new attributes * * Check permission before setting file attributes. Note that the kernel call * to notify_change is performed from several locations, whenever file * attributes change (such as when a file is truncated, chown/chmod operations, * transferring disk quotas, etc). * * Return: Returns 0 if permission is granted. */ int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_setattr, idmap, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); /** * security_inode_post_setattr() - Update the inode after a setattr operation * @idmap: idmap of the mount * @dentry: file * @ia_valid: file attributes set * * Update inode security field after successful setting file attributes. */ void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int ia_valid) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setattr, idmap, dentry, ia_valid); } /** * security_inode_getattr() - Check if getting file attributes is allowed * @path: file * * Check permission before obtaining file attributes. * * Return: Returns 0 if permission is granted. */ int security_inode_getattr(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(inode_getattr, path); } /** * security_inode_setxattr() - Check if setting file xattrs is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * @value: xattr value * @size: size of xattr value * @flags: flags * * This hook performs the desired permission checks before setting the extended * attributes (xattrs) on @dentry. It is important to note that we have some * additional logic before the main LSM implementation calls to detect if we * need to perform an additional capability check at the LSM layer. * * Normally we enforce a capability check prior to executing the various LSM * hook implementations, but if a LSM wants to avoid this capability check, * it can register a 'inode_xattr_skipcap' hook and return a value of 1 for * xattrs that it wants to avoid the capability check, leaving the LSM fully * responsible for enforcing the access control for the specific xattr. If all * of the enabled LSMs refrain from registering a 'inode_xattr_skipcap' hook, * or return a 0 (the default return value), the capability check is still * performed. If no 'inode_xattr_skipcap' hooks are registered the capability * check is performed. * * Return: Returns 0 if permission is granted. */ int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int rc; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* enforce the capability checks at the lsm layer, if needed */ if (!call_int_hook(inode_xattr_skipcap, name)) { rc = cap_inode_setxattr(dentry, name, value, size, flags); if (rc) return rc; } return call_int_hook(inode_setxattr, idmap, dentry, name, value, size, flags); } /** * security_inode_set_acl() - Check if setting posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * @kacl: acl struct * * Check permission before setting posix acls, the posix acls in @kacl are * identified by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_set_acl, idmap, dentry, acl_name, kacl); } /** * security_inode_post_set_acl() - Update inode security from posix acls set * @dentry: file * @acl_name: acl name * @kacl: acl struct * * Update inode security data after successfully setting posix acls on @dentry. * The posix acls in @kacl are identified by @acl_name. */ void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_set_acl, dentry, acl_name, kacl); } /** * security_inode_get_acl() - Check if reading posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before getting osix acls, the posix acls are identified by * @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_get_acl, idmap, dentry, acl_name); } /** * security_inode_remove_acl() - Check if removing a posix acl is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before removing posix acls, the posix acls are identified * by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_remove_acl, idmap, dentry, acl_name); } /** * security_inode_post_remove_acl() - Update inode security after rm posix acls * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Update inode security data after successfully removing posix acls on * @dentry in @idmap. The posix acls are identified by @acl_name. */ void security_inode_post_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_remove_acl, idmap, dentry, acl_name); } /** * security_inode_post_setxattr() - Update the inode after a setxattr operation * @dentry: file * @name: xattr name * @value: xattr value * @size: xattr value size * @flags: flags * * Update inode security field after successful setxattr operation. */ void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setxattr, dentry, name, value, size, flags); } /** * security_inode_getxattr() - Check if xattr access is allowed * @dentry: file * @name: xattr name * * Check permission before obtaining the extended attributes identified by * @name for @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_getxattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_getxattr, dentry, name); } /** * security_inode_listxattr() - Check if listing xattrs is allowed * @dentry: file * * Check permission before obtaining the list of extended attribute names for * @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_listxattr(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_listxattr, dentry); } /** * security_inode_removexattr() - Check if removing an xattr is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * * This hook performs the desired permission checks before setting the extended * attributes (xattrs) on @dentry. It is important to note that we have some * additional logic before the main LSM implementation calls to detect if we * need to perform an additional capability check at the LSM layer. * * Normally we enforce a capability check prior to executing the various LSM * hook implementations, but if a LSM wants to avoid this capability check, * it can register a 'inode_xattr_skipcap' hook and return a value of 1 for * xattrs that it wants to avoid the capability check, leaving the LSM fully * responsible for enforcing the access control for the specific xattr. If all * of the enabled LSMs refrain from registering a 'inode_xattr_skipcap' hook, * or return a 0 (the default return value), the capability check is still * performed. If no 'inode_xattr_skipcap' hooks are registered the capability * check is performed. * * Return: Returns 0 if permission is granted. */ int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { int rc; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* enforce the capability checks at the lsm layer, if needed */ if (!call_int_hook(inode_xattr_skipcap, name)) { rc = cap_inode_removexattr(idmap, dentry, name); if (rc) return rc; } return call_int_hook(inode_removexattr, idmap, dentry, name); } /** * security_inode_post_removexattr() - Update the inode after a removexattr op * @dentry: file * @name: xattr name * * Update the inode after a successful removexattr operation. */ void security_inode_post_removexattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_removexattr, dentry, name); } /** * security_inode_need_killpriv() - Check if security_inode_killpriv() required * @dentry: associated dentry * * Called when an inode has been changed to determine if * security_inode_killpriv() should be called. * * Return: Return <0 on error to abort the inode change operation, return 0 if * security_inode_killpriv() does not need to be called, return >0 if * security_inode_killpriv() does need to be called. */ int security_inode_need_killpriv(struct dentry *dentry) { return call_int_hook(inode_need_killpriv, dentry); } /** * security_inode_killpriv() - The setuid bit is removed, update LSM state * @idmap: idmap of the mount * @dentry: associated dentry * * The @dentry's setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. * * Return: Return 0 on success. If error is returned, then the operation * causing setuid bit removal is failed. */ int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { return call_int_hook(inode_killpriv, idmap, dentry); } /** * security_inode_getsecurity() - Get the xattr security label of an inode * @idmap: idmap of the mount * @inode: inode * @name: xattr name * @buffer: security label buffer * @alloc: allocation flag * * Retrieve a copy of the extended attribute representation of the security * label associated with @name for @inode via @buffer. Note that @name is the * remainder of the attribute name after the security prefix has been removed. * @alloc is used to specify if the call should return a value via the buffer * or just the value length. * * Return: Returns size of buffer on success. */ int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_getsecurity); return call_int_hook(inode_getsecurity, idmap, inode, name, buffer, alloc); } /** * security_inode_setsecurity() - Set the xattr security label of an inode * @inode: inode * @name: xattr name * @value: security label * @size: length of security label * @flags: flags * * Set the security label associated with @name for @inode from the extended * attribute value @value. @size indicates the size of the @value in bytes. * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the * remainder of the attribute name after the security. prefix has been removed. * * Return: Returns 0 on success. */ int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_setsecurity); return call_int_hook(inode_setsecurity, inode, name, value, size, flags); } /** * security_inode_listsecurity() - List the xattr security label names * @inode: inode * @buffer: buffer * @buffer_size: size of buffer * * Copy the extended attribute names for the security labels associated with * @inode into @buffer. The maximum size of @buffer is specified by * @buffer_size. @buffer may be NULL to request the size of the buffer * required. * * Return: Returns number of bytes used/required on success. */ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_listsecurity, inode, buffer, buffer_size); } EXPORT_SYMBOL(security_inode_listsecurity); /** * security_inode_getlsmprop() - Get an inode's LSM data * @inode: inode * @prop: lsm specific information to return * * Get the lsm specific information associated with the node. */ void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop) { call_void_hook(inode_getlsmprop, inode, prop); } /** * security_inode_copy_up() - Create new creds for an overlayfs copy-up op * @src: union dentry of copy-up file * @new: newly created creds * * A file is about to be copied up from lower layer to upper layer of overlay * filesystem. Security module can prepare a set of new creds and modify as * need be and return new creds. Caller will switch to new creds temporarily to * create new file and release newly allocated creds. * * Return: Returns 0 on success or a negative error code on error. */ int security_inode_copy_up(struct dentry *src, struct cred **new) { return call_int_hook(inode_copy_up, src, new); } EXPORT_SYMBOL(security_inode_copy_up); /** * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op * @src: union dentry of copy-up file * @name: xattr name * * Filter the xattrs being copied up when a unioned file is copied up from a * lower layer to the union/overlay layer. The caller is responsible for * reading and writing the xattrs, this hook is merely a filter. * * Return: Returns 0 to accept the xattr, -ECANCELED to discard the xattr, * -EOPNOTSUPP if the security module does not know about attribute, * or a negative error code to abort the copy up. */ int security_inode_copy_up_xattr(struct dentry *src, const char *name) { int rc; rc = call_int_hook(inode_copy_up_xattr, src, name); if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) return rc; return LSM_RET_DEFAULT(inode_copy_up_xattr); } EXPORT_SYMBOL(security_inode_copy_up_xattr); /** * security_inode_setintegrity() - Set the inode's integrity data * @inode: inode * @type: type of integrity, e.g. hash digest, signature, etc * @value: the integrity value * @size: size of the integrity value * * Register a verified integrity measurement of a inode with LSMs. * LSMs should free the previously saved data if @value is NULL. * * Return: Returns 0 on success, negative values on failure. */ int security_inode_setintegrity(const struct inode *inode, enum lsm_integrity_type type, const void *value, size_t size) { return call_int_hook(inode_setintegrity, inode, type, value, size); } EXPORT_SYMBOL(security_inode_setintegrity); /** * security_kernfs_init_security() - Init LSM context for a kernfs node * @kn_dir: parent kernfs node * @kn: the kernfs node to initialize * * Initialize the security context of a newly created kernfs node based on its * own and its parent's attributes. * * Return: Returns 0 if permission is granted. */ int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { return call_int_hook(kernfs_init_security, kn_dir, kn); } /** * security_file_permission() - Check file permissions * @file: file * @mask: requested permissions * * Check file permissions before accessing an open file. This hook is called * by various operations that read or write files. A security module can use * this hook to perform additional checking on these operations, e.g. to * revalidate permissions on use to support privilege bracketing or policy * changes. Notice that this hook is used when the actual read/write * operations are performed, whereas the inode_security_ops hook is called when * a file is opened (as well as many other operations). Although this hook can * be used to revalidate permissions for various system call operations that * read or write files, it does not address the revalidation of permissions for * memory-mapped files. Security modules must handle this separately if they * need such revalidation. * * Return: Returns 0 if permission is granted. */ int security_file_permission(struct file *file, int mask) { return call_int_hook(file_permission, file, mask); } /** * security_file_alloc() - Allocate and init a file's LSM blob * @file: the file * * Allocate and attach a security structure to the file->f_security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if the hook is successful and permission is granted. */ int security_file_alloc(struct file *file) { int rc = lsm_file_alloc(file); if (rc) return rc; rc = call_int_hook(file_alloc_security, file); if (unlikely(rc)) security_file_free(file); return rc; } /** * security_file_release() - Perform actions before releasing the file ref * @file: the file * * Perform actions before releasing the last reference to a file. */ void security_file_release(struct file *file) { call_void_hook(file_release, file); } /** * security_file_free() - Free a file's LSM blob * @file: the file * * Deallocate and free any security structures stored in file->f_security. */ void security_file_free(struct file *file) { void *blob; call_void_hook(file_free_security, file); blob = file->f_security; if (blob) { file->f_security = NULL; kmem_cache_free(lsm_file_cache, blob); } } /** * security_file_ioctl() - Check if an ioctl is allowed * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Check permission for an ioctl operation on @file. Note that @arg sometimes * represents a user space pointer; in other cases, it may be a simple integer * value. When @arg represents a user space pointer, it should never be used * by the security module. * * Return: Returns 0 if permission is granted. */ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl); /** * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Compat version of security_file_ioctl() that correctly handles 32-bit * processes running on 64-bit kernels. * * Return: Returns 0 if permission is granted. */ int security_file_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl_compat, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl_compat); static inline unsigned long mmap_prot(struct file *file, unsigned long prot) { /* * Does we have PROT_READ and does the application expect * it to imply PROT_EXEC? If not, nothing to talk about... */ if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ) return prot; if (!(current->personality & READ_IMPLIES_EXEC)) return prot; /* * if that's an anonymous mapping, let it. */ if (!file) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!path_noexec(&file->f_path)) { #ifndef CONFIG_MMU if (file->f_op->mmap_capabilities) { unsigned caps = file->f_op->mmap_capabilities(file); if (!(caps & NOMMU_MAP_EXEC)) return prot; } #endif return prot | PROT_EXEC; } /* anything on noexec mount won't get PROT_EXEC */ return prot; } /** * security_mmap_file() - Check if mmap'ing a file is allowed * @file: file * @prot: protection applied by the kernel * @flags: flags * * Check permissions for a mmap operation. The @file may be NULL, e.g. if * mapping anonymous memory. * * Return: Returns 0 if permission is granted. */ int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { return call_int_hook(mmap_file, file, prot, mmap_prot(file, prot), flags); } /** * security_mmap_addr() - Check if mmap'ing an address is allowed * @addr: address * * Check permissions for a mmap operation at @addr. * * Return: Returns 0 if permission is granted. */ int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, addr); } /** * security_file_mprotect() - Check if changing memory protections is allowed * @vma: memory region * @reqprot: application requested protection * @prot: protection applied by the kernel * * Check permissions before changing memory access permissions. * * Return: Returns 0 if permission is granted. */ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { return call_int_hook(file_mprotect, vma, reqprot, prot); } /** * security_file_lock() - Check if a file lock is allowed * @file: file * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK) * * Check permission before performing file locking operations. Note the hook * mediates both flock and fcntl style locks. * * Return: Returns 0 if permission is granted. */ int security_file_lock(struct file *file, unsigned int cmd) { return call_int_hook(file_lock, file, cmd); } /** * security_file_fcntl() - Check if fcntl() op is allowed * @file: file * @cmd: fcntl command * @arg: command argument * * Check permission before allowing the file operation specified by @cmd from * being performed on the file @file. Note that @arg sometimes represents a * user space pointer; in other cases, it may be a simple integer value. When * @arg represents a user space pointer, it should never be used by the * security module. * * Return: Returns 0 if permission is granted. */ int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_fcntl, file, cmd, arg); } /** * security_file_set_fowner() - Set the file owner info in the LSM blob * @file: the file * * Save owner security information (typically from current->security) in * file->f_security for later use by the send_sigiotask hook. * * This hook is called with file->f_owner.lock held. * * Return: Returns 0 on success. */ void security_file_set_fowner(struct file *file) { call_void_hook(file_set_fowner, file); } /** * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed * @tsk: target task * @fown: signal sender * @sig: signal to be sent, SIGIO is sent if 0 * * Check permission for the file owner @fown to send SIGIO or SIGURG to the * process @tsk. Note that this hook is sometimes called from interrupt. Note * that the fown_struct, @fown, is never outside the context of a struct file, * so the file structure (and associated security information) can always be * obtained: container_of(fown, struct file, f_owner). * * Return: Returns 0 if permission is granted. */ int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig) { return call_int_hook(file_send_sigiotask, tsk, fown, sig); } /** * security_file_receive() - Check if receiving a file via IPC is allowed * @file: file being received * * This hook allows security modules to control the ability of a process to * receive an open file descriptor via socket IPC. * * Return: Returns 0 if permission is granted. */ int security_file_receive(struct file *file) { return call_int_hook(file_receive, file); } /** * security_file_open() - Save open() time state for late use by the LSM * @file: * * Save open-time permission checking state for later use upon file_permission, * and recheck access if anything has changed since inode_permission. * * We can check if a file is opened for execution (e.g. execve(2) call), either * directly or indirectly (e.g. ELF's ld.so) by checking file->f_flags & * __FMODE_EXEC . * * Return: Returns 0 if permission is granted. */ int security_file_open(struct file *file) { return call_int_hook(file_open, file); } /** * security_file_post_open() - Evaluate a file after it has been opened * @file: the file * @mask: access mask * * Evaluate an opened file and the access mask requested with open(). The hook * is useful for LSMs that require the file content to be available in order to * make decisions. * * Return: Returns 0 if permission is granted. */ int security_file_post_open(struct file *file, int mask) { return call_int_hook(file_post_open, file, mask); } EXPORT_SYMBOL_GPL(security_file_post_open); /** * security_file_truncate() - Check if truncating a file is allowed * @file: file * * Check permission before truncating a file, i.e. using ftruncate. Note that * truncation permission may also be checked based on the path, using the * @path_truncate hook. * * Return: Returns 0 if permission is granted. */ int security_file_truncate(struct file *file) { return call_int_hook(file_truncate, file); } /** * security_task_alloc() - Allocate a task's LSM blob * @task: the task * @clone_flags: flags indicating what is being shared * * Handle allocation of task-related resources. * * Return: Returns a zero on success, negative values on failure. */ int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { int rc = lsm_task_alloc(task); if (rc) return rc; rc = call_int_hook(task_alloc, task, clone_flags); if (unlikely(rc)) security_task_free(task); return rc; } /** * security_task_free() - Free a task's LSM blob and related resources * @task: task * * Handle release of task-related resources. Note that this can be called from * interrupt context. */ void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); kfree(task->security); task->security = NULL; } /** * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer * @cred: credentials * @gfp: gfp flags * * Only allocate sufficient memory and attach to @cred such that * cred_transfer() will not get ENOMEM. * * Return: Returns 0 on success, negative values on failure. */ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { int rc = lsm_cred_alloc(cred, gfp); if (rc) return rc; rc = call_int_hook(cred_alloc_blank, cred, gfp); if (unlikely(rc)) security_cred_free(cred); return rc; } /** * security_cred_free() - Free the cred's LSM blob and associated resources * @cred: credentials * * Deallocate and clear the cred->security field in a set of credentials. */ void security_cred_free(struct cred *cred) { /* * There is a failure case in prepare_creds() that * may result in a call here with ->security being NULL. */ if (unlikely(cred->security == NULL)) return; call_void_hook(cred_free, cred); kfree(cred->security); cred->security = NULL; } /** * security_prepare_creds() - Prepare a new set of credentials * @new: new credentials * @old: original credentials * @gfp: gfp flags * * Prepare a new set of credentials by copying the data from the old set. * * Return: Returns 0 on success, negative values on failure. */ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { int rc = lsm_cred_alloc(new, gfp); if (rc) return rc; rc = call_int_hook(cred_prepare, new, old, gfp); if (unlikely(rc)) security_cred_free(new); return rc; } /** * security_transfer_creds() - Transfer creds * @new: target credentials * @old: original credentials * * Transfer data from original creds to new creds. */ void security_transfer_creds(struct cred *new, const struct cred *old) { call_void_hook(cred_transfer, new, old); } /** * security_cred_getsecid() - Get the secid from a set of credentials * @c: credentials * @secid: secid value * * Retrieve the security identifier of the cred structure @c. In case of * failure, @secid will be set to zero. */ void security_cred_getsecid(const struct cred *c, u32 *secid) { *secid = 0; call_void_hook(cred_getsecid, c, secid); } EXPORT_SYMBOL(security_cred_getsecid); /** * security_cred_getlsmprop() - Get the LSM data from a set of credentials * @c: credentials * @prop: destination for the LSM data * * Retrieve the security data of the cred structure @c. In case of * failure, @prop will be cleared. */ void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(cred_getlsmprop, c, prop); } EXPORT_SYMBOL(security_cred_getlsmprop); /** * security_kernel_act_as() - Set the kernel credentials to act as secid * @new: credentials * @secid: secid * * Set the credentials for a kernel service to act as (subjective context). * The current task must be the one that nominated @secid. * * Return: Returns 0 if successful. */ int security_kernel_act_as(struct cred *new, u32 secid) { return call_int_hook(kernel_act_as, new, secid); } /** * security_kernel_create_files_as() - Set file creation context using an inode * @new: target credentials * @inode: reference inode * * Set the file creation context in a set of credentials to be the same as the * objective context of the specified inode. The current task must be the one * that nominated @inode. * * Return: Returns 0 if successful. */ int security_kernel_create_files_as(struct cred *new, struct inode *inode) { return call_int_hook(kernel_create_files_as, new, inode); } /** * security_kernel_module_request() - Check if loading a module is allowed * @kmod_name: module name * * Ability to trigger the kernel to automatically upcall to userspace for * userspace to load a kernel module with the given name. * * Return: Returns 0 if successful. */ int security_kernel_module_request(char *kmod_name) { return call_int_hook(kernel_module_request, kmod_name); } /** * security_kernel_read_file() - Read a file specified by userspace * @file: file * @id: file identifier * @contents: trust if security_kernel_post_read_file() will be called * * Read a file specified by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { return call_int_hook(kernel_read_file, file, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_read_file); /** * security_kernel_post_read_file() - Read a file specified by userspace * @file: file * @buf: file contents * @size: size of file contents * @id: file identifier * * Read a file specified by userspace. This must be paired with a prior call * to security_kernel_read_file() call that indicated this hook would also be * called, see security_kernel_read_file() for more information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { return call_int_hook(kernel_post_read_file, file, buf, size, id); } EXPORT_SYMBOL_GPL(security_kernel_post_read_file); /** * security_kernel_load_data() - Load data provided by userspace * @id: data identifier * @contents: true if security_kernel_post_load_data() will be called * * Load data provided by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_load_data(enum kernel_load_data_id id, bool contents) { return call_int_hook(kernel_load_data, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_load_data); /** * security_kernel_post_load_data() - Load userspace data from a non-file source * @buf: data * @size: size of data * @id: data identifier * @description: text description of data, specific to the id value * * Load data provided by a non-file source (usually userspace buffer). This * must be paired with a prior security_kernel_load_data() call that indicated * this hook would also be called, see security_kernel_load_data() for more * information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description) { return call_int_hook(kernel_post_load_data, buf, size, id, description); } EXPORT_SYMBOL_GPL(security_kernel_post_load_data); /** * security_task_fix_setuid() - Update LSM with new user id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag values * * Update the module's state after setting one or more of the user identity * attributes of the current process. The @flags parameter indicates which of * the set*uid system calls invoked this hook. If @new is the set of * credentials that will be installed. Modifications should be made to this * rather than to @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setuid, new, old, flags); } /** * security_task_fix_setgid() - Update LSM with new group id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag value * * Update the module's state after setting one or more of the group identity * attributes of the current process. The @flags parameter indicates which of * the set*gid system calls invoked this hook. @new is the set of credentials * that will be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setgid, new, old, flags); } /** * security_task_fix_setgroups() - Update LSM with new supplementary groups * @new: updated credentials * @old: credentials being replaced * * Update the module's state after setting the supplementary group identity * attributes of the current process. @new is the set of credentials that will * be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgroups(struct cred *new, const struct cred *old) { return call_int_hook(task_fix_setgroups, new, old); } /** * security_task_setpgid() - Check if setting the pgid is allowed * @p: task being modified * @pgid: new pgid * * Check permission before setting the process group identifier of the process * @p to @pgid. * * Return: Returns 0 if permission is granted. */ int security_task_setpgid(struct task_struct *p, pid_t pgid) { return call_int_hook(task_setpgid, p, pgid); } /** * security_task_getpgid() - Check if getting the pgid is allowed * @p: task * * Check permission before getting the process group identifier of the process * @p. * * Return: Returns 0 if permission is granted. */ int security_task_getpgid(struct task_struct *p) { return call_int_hook(task_getpgid, p); } /** * security_task_getsid() - Check if getting the session id is allowed * @p: task * * Check permission before getting the session identifier of the process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getsid(struct task_struct *p) { return call_int_hook(task_getsid, p); } /** * security_current_getlsmprop_subj() - Current task's subjective LSM data * @prop: lsm specific information * * Retrieve the subjective security identifier of the current task and return * it in @prop. */ void security_current_getlsmprop_subj(struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(current_getlsmprop_subj, prop); } EXPORT_SYMBOL(security_current_getlsmprop_subj); /** * security_task_getlsmprop_obj() - Get a task's objective LSM data * @p: target task * @prop: lsm specific information * * Retrieve the objective security identifier of the task_struct in @p and * return it in @prop. */ void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(task_getlsmprop_obj, p, prop); } EXPORT_SYMBOL(security_task_getlsmprop_obj); /** * security_task_setnice() - Check if setting a task's nice value is allowed * @p: target task * @nice: nice value * * Check permission before setting the nice value of @p to @nice. * * Return: Returns 0 if permission is granted. */ int security_task_setnice(struct task_struct *p, int nice) { return call_int_hook(task_setnice, p, nice); } /** * security_task_setioprio() - Check if setting a task's ioprio is allowed * @p: target task * @ioprio: ioprio value * * Check permission before setting the ioprio value of @p to @ioprio. * * Return: Returns 0 if permission is granted. */ int security_task_setioprio(struct task_struct *p, int ioprio) { return call_int_hook(task_setioprio, p, ioprio); } /** * security_task_getioprio() - Check if getting a task's ioprio is allowed * @p: task * * Check permission before getting the ioprio value of @p. * * Return: Returns 0 if permission is granted. */ int security_task_getioprio(struct task_struct *p) { return call_int_hook(task_getioprio, p); } /** * security_task_prlimit() - Check if get/setting resources limits is allowed * @cred: current task credentials * @tcred: target task credentials * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both * * Check permission before getting and/or setting the resource limits of * another task. * * Return: Returns 0 if permission is granted. */ int security_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { return call_int_hook(task_prlimit, cred, tcred, flags); } /** * security_task_setrlimit() - Check if setting a new rlimit value is allowed * @p: target task's group leader * @resource: resource whose limit is being set * @new_rlim: new resource limit * * Check permission before setting the resource limits of process @p for * @resource to @new_rlim. The old resource limit values can be examined by * dereferencing (p->signal->rlim + resource). * * Return: Returns 0 if permission is granted. */ int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { return call_int_hook(task_setrlimit, p, resource, new_rlim); } /** * security_task_setscheduler() - Check if setting sched policy/param is allowed * @p: target task * * Check permission before setting scheduling policy and/or parameters of * process @p. * * Return: Returns 0 if permission is granted. */ int security_task_setscheduler(struct task_struct *p) { return call_int_hook(task_setscheduler, p); } /** * security_task_getscheduler() - Check if getting scheduling info is allowed * @p: target task * * Check permission before obtaining scheduling information for process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getscheduler(struct task_struct *p) { return call_int_hook(task_getscheduler, p); } /** * security_task_movememory() - Check if moving memory is allowed * @p: task * * Check permission before moving memory owned by process @p. * * Return: Returns 0 if permission is granted. */ int security_task_movememory(struct task_struct *p) { return call_int_hook(task_movememory, p); } /** * security_task_kill() - Check if sending a signal is allowed * @p: target process * @info: signal information * @sig: signal value * @cred: credentials of the signal sender, NULL if @current * * Check permission before sending signal @sig to @p. @info can be NULL, the * constant 1, or a pointer to a kernel_siginfo structure. If @info is 1 or * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from * the kernel and should typically be permitted. SIGIO signals are handled * separately by the send_sigiotask hook in file_security_ops. * * Return: Returns 0 if permission is granted. */ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { return call_int_hook(task_kill, p, info, sig, cred); } /** * security_task_prctl() - Check if a prctl op is allowed * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Check permission before performing a process control operation on the * current process. * * Return: Return -ENOSYS if no-one wanted to handle this op, any other value * to cause prctl() to return immediately with that value. */ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int thisrc; int rc = LSM_RET_DEFAULT(task_prctl); struct lsm_static_call *scall; lsm_for_each_hook(scall, task_prctl) { thisrc = scall->hl->hook.task_prctl(option, arg2, arg3, arg4, arg5); if (thisrc != LSM_RET_DEFAULT(task_prctl)) { rc = thisrc; if (thisrc != 0) break; } } return rc; } /** * security_task_to_inode() - Set the security attributes of a task's inode * @p: task * @inode: inode * * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. */ void security_task_to_inode(struct task_struct *p, struct inode *inode) { call_void_hook(task_to_inode, p, inode); } /** * security_create_user_ns() - Check if creating a new userns is allowed * @cred: prepared creds * * Check permission prior to creating a new user namespace. * * Return: Returns 0 if successful, otherwise < 0 error code. */ int security_create_user_ns(const struct cred *cred) { return call_int_hook(userns_create, cred); } /** * security_ipc_permission() - Check if sysv ipc access is allowed * @ipcp: ipc permission structure * @flag: requested permissions * * Check permissions for access to IPC. * * Return: Returns 0 if permission is granted. */ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, ipcp, flag); } /** * security_ipc_getlsmprop() - Get the sysv ipc object LSM data * @ipcp: ipc permission structure * @prop: pointer to lsm information * * Get the lsm information associated with the ipc object. */ void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop) { lsmprop_init(prop); call_void_hook(ipc_getlsmprop, ipcp, prop); } /** * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob * @msg: message structure * * Allocate and attach a security structure to the msg->security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if operation was successful and permission is granted. */ int security_msg_msg_alloc(struct msg_msg *msg) { int rc = lsm_msg_msg_alloc(msg); if (unlikely(rc)) return rc; rc = call_int_hook(msg_msg_alloc_security, msg); if (unlikely(rc)) security_msg_msg_free(msg); return rc; } /** * security_msg_msg_free() - Free a sysv ipc message LSM blob * @msg: message structure * * Deallocate the security structure for this message. */ void security_msg_msg_free(struct msg_msg *msg) { call_void_hook(msg_msg_free_security, msg); kfree(msg->security); msg->security = NULL; } /** * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Allocate and attach a security structure to @msg. The security field is * initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_msg_queue_alloc(struct kern_ipc_perm *msq) { int rc = lsm_ipc_alloc(msq); if (unlikely(rc)) return rc; rc = call_int_hook(msg_queue_alloc_security, msq); if (unlikely(rc)) security_msg_queue_free(msq); return rc; } /** * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Deallocate security field @perm->security for the message queue. */ void security_msg_queue_free(struct kern_ipc_perm *msq) { call_void_hook(msg_queue_free_security, msq); kfree(msq->security); msq->security = NULL; } /** * security_msg_queue_associate() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @msqflg: operation flags * * Check permission when a message queue is requested through the msgget system * call. This hook is only called when returning the message queue identifier * for an existing message queue, not when a new message queue is created. * * Return: Return 0 if permission is granted. */ int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { return call_int_hook(msg_queue_associate, msq, msqflg); } /** * security_msg_queue_msgctl() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @cmd: operation * * Check permission when a message control operation specified by @cmd is to be * performed on the message queue with permissions. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { return call_int_hook(msg_queue_msgctl, msq, cmd); } /** * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed * @msq: sysv ipc permission structure * @msg: message * @msqflg: operation flags * * Check permission before a message, @msg, is enqueued on the message queue * with permissions specified in @msq. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { return call_int_hook(msg_queue_msgsnd, msq, msg, msqflg); } /** * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed * @msq: sysv ipc permission structure * @msg: message * @target: target task * @type: type of message requested * @mode: operation flags * * Check permission before a message, @msg, is removed from the message queue. * The @target task structure contains a pointer to the process that will be * receiving the message (not equal to the current process when inline receives * are being performed). * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { return call_int_hook(msg_queue_msgrcv, msq, msg, target, type, mode); } /** * security_shm_alloc() - Allocate a sysv shm LSM blob * @shp: sysv ipc permission structure * * Allocate and attach a security structure to the @shp security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_shm_alloc(struct kern_ipc_perm *shp) { int rc = lsm_ipc_alloc(shp); if (unlikely(rc)) return rc; rc = call_int_hook(shm_alloc_security, shp); if (unlikely(rc)) security_shm_free(shp); return rc; } /** * security_shm_free() - Free a sysv shm LSM blob * @shp: sysv ipc permission structure * * Deallocate the security structure @perm->security for the memory segment. */ void security_shm_free(struct kern_ipc_perm *shp) { call_void_hook(shm_free_security, shp); kfree(shp->security); shp->security = NULL; } /** * security_shm_associate() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @shmflg: operation flags * * Check permission when a shared memory region is requested through the shmget * system call. This hook is only called when returning the shared memory * region identifier for an existing region, not when a new shared memory * region is created. * * Return: Returns 0 if permission is granted. */ int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) { return call_int_hook(shm_associate, shp, shmflg); } /** * security_shm_shmctl() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @cmd: operation * * Check permission when a shared memory control operation specified by @cmd is * to be performed on the shared memory region with permissions in @shp. * * Return: Return 0 if permission is granted. */ int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { return call_int_hook(shm_shmctl, shp, cmd); } /** * security_shm_shmat() - Check if a sysv shm attach operation is allowed * @shp: sysv ipc permission structure * @shmaddr: address of memory region to attach * @shmflg: operation flags * * Check permissions prior to allowing the shmat system call to attach the * shared memory segment with permissions @shp to the data segment of the * calling process. The attaching address is specified by @shmaddr. * * Return: Returns 0 if permission is granted. */ int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { return call_int_hook(shm_shmat, shp, shmaddr, shmflg); } /** * security_sem_alloc() - Allocate a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Allocate and attach a security structure to the @sma security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_sem_alloc(struct kern_ipc_perm *sma) { int rc = lsm_ipc_alloc(sma); if (unlikely(rc)) return rc; rc = call_int_hook(sem_alloc_security, sma); if (unlikely(rc)) security_sem_free(sma); return rc; } /** * security_sem_free() - Free a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Deallocate security structure @sma->security for the semaphore. */ void security_sem_free(struct kern_ipc_perm *sma) { call_void_hook(sem_free_security, sma); kfree(sma->security); sma->security = NULL; } /** * security_sem_associate() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @semflg: operation flags * * Check permission when a semaphore is requested through the semget system * call. This hook is only called when returning the semaphore identifier for * an existing semaphore, not when a new one must be created. * * Return: Returns 0 if permission is granted. */ int security_sem_associate(struct kern_ipc_perm *sma, int semflg) { return call_int_hook(sem_associate, sma, semflg); } /** * security_sem_semctl() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @cmd: operation * * Check permission when a semaphore operation specified by @cmd is to be * performed on the semaphore. * * Return: Returns 0 if permission is granted. */ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd) { return call_int_hook(sem_semctl, sma, cmd); } /** * security_sem_semop() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @sops: operations to perform * @nsops: number of operations * @alter: flag indicating changes will be made * * Check permissions before performing operations on members of the semaphore * set. If the @alter flag is nonzero, the semaphore set may be modified. * * Return: Returns 0 if permission is granted. */ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { return call_int_hook(sem_semop, sma, sops, nsops, alter); } /** * security_d_instantiate() - Populate an inode's LSM state based on a dentry * @dentry: dentry * @inode: inode * * Fill in @inode security information for a @dentry if allowed. */ void security_d_instantiate(struct dentry *dentry, struct inode *inode) { if (unlikely(inode && IS_PRIVATE(inode))) return; call_void_hook(d_instantiate, dentry, inode); } EXPORT_SYMBOL(security_d_instantiate); /* * Please keep this in sync with it's counterpart in security/lsm_syscalls.c */ /** * security_getselfattr - Read an LSM attribute of the current process. * @attr: which attribute to return * @uctx: the user-space destination for the information, or NULL * @size: pointer to the size of space available to receive the data * @flags: special handling options. LSM_FLAG_SINGLE indicates that only * attributes associated with the LSM identified in the passed @ctx be * reported. * * A NULL value for @uctx can be used to get both the number of attributes * and the size of the data. * * Returns the number of attributes found on success, negative value * on error. @size is reset to the total size of the data. * If @size is insufficient to contain the data -E2BIG is returned. */ int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx, u32 __user *size, u32 flags) { struct lsm_static_call *scall; struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, }; u8 __user *base = (u8 __user *)uctx; u32 entrysize; u32 total = 0; u32 left; bool toobig = false; bool single = false; int count = 0; int rc; if (attr == LSM_ATTR_UNDEF) return -EINVAL; if (size == NULL) return -EINVAL; if (get_user(left, size)) return -EFAULT; if (flags) { /* * Only flag supported is LSM_FLAG_SINGLE */ if (flags != LSM_FLAG_SINGLE || !uctx) return -EINVAL; if (copy_from_user(&lctx, uctx, sizeof(lctx))) return -EFAULT; /* * If the LSM ID isn't specified it is an error. */ if (lctx.id == LSM_ID_UNDEF) return -EINVAL; single = true; } /* * In the usual case gather all the data from the LSMs. * In the single case only get the data from the LSM specified. */ lsm_for_each_hook(scall, getselfattr) { if (single && lctx.id != scall->hl->lsmid->id) continue; entrysize = left; if (base) uctx = (struct lsm_ctx __user *)(base + total); rc = scall->hl->hook.getselfattr(attr, uctx, &entrysize, flags); if (rc == -EOPNOTSUPP) continue; if (rc == -E2BIG) { rc = 0; left = 0; toobig = true; } else if (rc < 0) return rc; else left -= entrysize; total += entrysize; count += rc; if (single) break; } if (put_user(total, size)) return -EFAULT; if (toobig) return -E2BIG; if (count == 0) return LSM_RET_DEFAULT(getselfattr); return count; } /* * Please keep this in sync with it's counterpart in security/lsm_syscalls.c */ /** * security_setselfattr - Set an LSM attribute on the current process. * @attr: which attribute to set * @uctx: the user-space source for the information * @size: the size of the data * @flags: reserved for future use, must be 0 * * Set an LSM attribute for the current process. The LSM, attribute * and new value are included in @uctx. * * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT * if the user buffer is inaccessible, E2BIG if size is too big, or an * LSM specific failure. */ int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx, u32 size, u32 flags) { struct lsm_static_call *scall; struct lsm_ctx *lctx; int rc = LSM_RET_DEFAULT(setselfattr); u64 required_len; if (flags) return -EINVAL; if (size < sizeof(*lctx)) return -EINVAL; if (size > PAGE_SIZE) return -E2BIG; lctx = memdup_user(uctx, size); if (IS_ERR(lctx)) return PTR_ERR(lctx); if (size < lctx->len || check_add_overflow(sizeof(*lctx), lctx->ctx_len, &required_len) || lctx->len < required_len) { rc = -EINVAL; goto free_out; } lsm_for_each_hook(scall, setselfattr) if ((scall->hl->lsmid->id) == lctx->id) { rc = scall->hl->hook.setselfattr(attr, lctx, size, flags); break; } free_out: kfree(lctx); return rc; } /** * security_getprocattr() - Read an attribute for a task * @p: the task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * * Read attribute @name for task @p and store it into @value if allowed. * * Return: Returns the length of @value on success, a negative value otherwise. */ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { struct lsm_static_call *scall; lsm_for_each_hook(scall, getprocattr) { if (lsmid != 0 && lsmid != scall->hl->lsmid->id) continue; return scall->hl->hook.getprocattr(p, name, value); } return LSM_RET_DEFAULT(getprocattr); } /** * security_setprocattr() - Set an attribute for a task * @lsmid: LSM identification * @name: attribute name * @value: attribute value * @size: attribute value size * * Write (set) the current task's attribute @name to @value, size @size if * allowed. * * Return: Returns bytes written on success, a negative value otherwise. */ int security_setprocattr(int lsmid, const char *name, void *value, size_t size) { struct lsm_static_call *scall; lsm_for_each_hook(scall, setprocattr) { if (lsmid != 0 && lsmid != scall->hl->lsmid->id) continue; return scall->hl->hook.setprocattr(name, value, size); } return LSM_RET_DEFAULT(setprocattr); } /** * security_netlink_send() - Save info and check if netlink sending is allowed * @sk: sending socket * @skb: netlink message * * Save security information for a netlink message so that permission checking * can be performed when the message is processed. The security information * can be saved using the eff_cap field of the netlink_skb_parms structure. * Also may be used to provide fine grained control over message transmission. * * Return: Returns 0 if the information was successfully saved and message is * allowed to be transmitted. */ int security_netlink_send(struct sock *sk, struct sk_buff *skb) { return call_int_hook(netlink_send, sk, skb); } /** * security_ismaclabel() - Check if the named attribute is a MAC label * @name: full extended attribute name * * Check if the extended attribute specified by @name represents a MAC label. * * Return: Returns 1 if name is a MAC attribute otherwise returns 0. */ int security_ismaclabel(const char *name) { return call_int_hook(ismaclabel, name); } EXPORT_SYMBOL(security_ismaclabel); /** * security_secid_to_secctx() - Convert a secid to a secctx * @secid: secid * @cp: the LSM context * * Convert secid to security context. If @cp is NULL the length of the * result will be returned, but no data will be returned. This * does mean that the length could change between calls to check the length and * the next call which actually allocates and returns the data. * * Return: Return length of data on success, error on failure. */ int security_secid_to_secctx(u32 secid, struct lsm_context *cp) { return call_int_hook(secid_to_secctx, secid, cp); } EXPORT_SYMBOL(security_secid_to_secctx); /** * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx * @prop: lsm specific information * @cp: the LSM context * * Convert a @prop entry to security context. If @cp is NULL the * length of the result will be returned. This does mean that the * length could change between calls to check the length and the * next call which actually allocates and returns the @cp. * * Return: Return length of data on success, error on failure. */ int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) { return call_int_hook(lsmprop_to_secctx, prop, cp); } EXPORT_SYMBOL(security_lsmprop_to_secctx); /** * security_secctx_to_secid() - Convert a secctx to a secid * @secdata: secctx * @seclen: length of secctx * @secid: secid * * Convert security context to secid. * * Return: Returns 0 on success, error on failure. */ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { *secid = 0; return call_int_hook(secctx_to_secid, secdata, seclen, secid); } EXPORT_SYMBOL(security_secctx_to_secid); /** * security_release_secctx() - Free a secctx buffer * @cp: the security context * * Release the security context. */ void security_release_secctx(struct lsm_context *cp) { call_void_hook(release_secctx, cp); memset(cp, 0, sizeof(*cp)); } EXPORT_SYMBOL(security_release_secctx); /** * security_inode_invalidate_secctx() - Invalidate an inode's security label * @inode: inode * * Notify the security module that it must revalidate the security context of * an inode. */ void security_inode_invalidate_secctx(struct inode *inode) { call_void_hook(inode_invalidate_secctx, inode); } EXPORT_SYMBOL(security_inode_invalidate_secctx); /** * security_inode_notifysecctx() - Notify the LSM of an inode's security label * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * Notify the security module of what the security context of an inode should * be. Initializes the incore security context managed by the security module * for this inode. Example usage: NFS client invokes this hook to initialize * the security context in its incore inode to the value provided by the server * for the file when the server returned the file's attributes to the client. * Must be called with inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return call_int_hook(inode_notifysecctx, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_notifysecctx); /** * security_inode_setsecctx() - Change the security label of an inode * @dentry: inode * @ctx: secctx * @ctxlen: length of secctx * * Change the security context of an inode. Updates the incore security * context managed by the security module and invokes the fs code as needed * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the * context. Example usage: NFS server invokes this hook to change the security * context in its incore inode and on the backing filesystem to a value * provided by the client on a SETATTR operation. Must be called with * inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return call_int_hook(inode_setsecctx, dentry, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_setsecctx); /** * security_inode_getsecctx() - Get the security label of an inode * @inode: inode * @cp: security context * * On success, returns 0 and fills out @cp with the security context * for the given @inode. * * Return: Returns 0 on success, error on failure. */ int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp) { memset(cp, 0, sizeof(*cp)); return call_int_hook(inode_getsecctx, inode, cp); } EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_WATCH_QUEUE /** * security_post_notification() - Check if a watch notification can be posted * @w_cred: credentials of the task that set the watch * @cred: credentials of the task which triggered the watch * @n: the notification * * Check to see if a watch notification can be posted to a particular queue. * * Return: Returns 0 if permission is granted. */ int security_post_notification(const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) { return call_int_hook(post_notification, w_cred, cred, n); } #endif /* CONFIG_WATCH_QUEUE */ #ifdef CONFIG_KEY_NOTIFICATIONS /** * security_watch_key() - Check if a task is allowed to watch for key events * @key: the key to watch * * Check to see if a process is allowed to watch for event notifications from * a key or keyring. * * Return: Returns 0 if permission is granted. */ int security_watch_key(struct key *key) { return call_int_hook(watch_key, key); } #endif /* CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK /** * security_unix_stream_connect() - Check if a AF_UNIX stream is allowed * @sock: originating sock * @other: peer sock * @newsk: new sock * * Check permissions before establishing a Unix domain stream connection * between @sock and @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return call_int_hook(unix_stream_connect, sock, other, newsk); } EXPORT_SYMBOL(security_unix_stream_connect); /** * security_unix_may_send() - Check if AF_UNIX socket can send datagrams * @sock: originating sock * @other: peer sock * * Check permissions before connecting or sending datagrams from @sock to * @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_may_send(struct socket *sock, struct socket *other) { return call_int_hook(unix_may_send, sock, other); } EXPORT_SYMBOL(security_unix_may_send); /** * security_socket_create() - Check if creating a new socket is allowed * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * Check permissions prior to creating a new socket. * * Return: Returns 0 if permission is granted. */ int security_socket_create(int family, int type, int protocol, int kern) { return call_int_hook(socket_create, family, type, protocol, kern); } /** * security_socket_post_create() - Initialize a newly created socket * @sock: socket * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * This hook allows a module to update or allocate a per-socket security * structure. Note that the security field was not added directly to the socket * structure, but rather, the socket security information is stored in the * associated inode. Typically, the inode alloc_security hook will allocate * and attach security information to SOCK_INODE(sock)->i_security. This hook * may be used to update the SOCK_INODE(sock)->i_security field with additional * information that wasn't available when the inode was allocated. * * Return: Returns 0 if permission is granted. */ int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { return call_int_hook(socket_post_create, sock, family, type, protocol, kern); } /** * security_socket_socketpair() - Check if creating a socketpair is allowed * @socka: first socket * @sockb: second socket * * Check permissions before creating a fresh pair of sockets. * * Return: Returns 0 if permission is granted and the connection was * established. */ int security_socket_socketpair(struct socket *socka, struct socket *sockb) { return call_int_hook(socket_socketpair, socka, sockb); } EXPORT_SYMBOL(security_socket_socketpair); /** * security_socket_bind() - Check if a socket bind operation is allowed * @sock: socket * @address: requested bind address * @addrlen: length of address * * Check permission before socket protocol layer bind operation is performed * and the socket @sock is bound to the address specified in the @address * parameter. * * Return: Returns 0 if permission is granted. */ int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_bind, sock, address, addrlen); } /** * security_socket_connect() - Check if a socket connect operation is allowed * @sock: socket * @address: address of remote connection point * @addrlen: length of address * * Check permission before socket protocol layer connect operation attempts to * connect socket @sock to a remote address, @address. * * Return: Returns 0 if permission is granted. */ int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_connect, sock, address, addrlen); } /** * security_socket_listen() - Check if a socket is allowed to listen * @sock: socket * @backlog: connection queue size * * Check permission before socket protocol layer listen operation. * * Return: Returns 0 if permission is granted. */ int security_socket_listen(struct socket *sock, int backlog) { return call_int_hook(socket_listen, sock, backlog); } /** * security_socket_accept() - Check if a socket is allowed to accept connections * @sock: listening socket * @newsock: newly creation connection socket * * Check permission before accepting a new connection. Note that the new * socket, @newsock, has been created and some information copied to it, but * the accept operation has not actually been performed. * * Return: Returns 0 if permission is granted. */ int security_socket_accept(struct socket *sock, struct socket *newsock) { return call_int_hook(socket_accept, sock, newsock); } /** * security_socket_sendmsg() - Check if sending a message is allowed * @sock: sending socket * @msg: message to send * @size: size of message * * Check permission before transmitting a message to another socket. * * Return: Returns 0 if permission is granted. */ int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return call_int_hook(socket_sendmsg, sock, msg, size); } /** * security_socket_recvmsg() - Check if receiving a message is allowed * @sock: receiving socket * @msg: message to receive * @size: size of message * @flags: operational flags * * Check permission before receiving a message from a socket. * * Return: Returns 0 if permission is granted. */ int security_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return call_int_hook(socket_recvmsg, sock, msg, size, flags); } /** * security_socket_getsockname() - Check if reading the socket addr is allowed * @sock: socket * * Check permission before reading the local address (name) of the socket * object. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockname(struct socket *sock) { return call_int_hook(socket_getsockname, sock); } /** * security_socket_getpeername() - Check if reading the peer's addr is allowed * @sock: socket * * Check permission before the remote address (name) of a socket object. * * Return: Returns 0 if permission is granted. */ int security_socket_getpeername(struct socket *sock) { return call_int_hook(socket_getpeername, sock); } /** * security_socket_getsockopt() - Check if reading a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before retrieving the options associated with socket * @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_getsockopt, sock, level, optname); } /** * security_socket_setsockopt() - Check if setting a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before setting the options associated with socket @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_setsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_setsockopt, sock, level, optname); } /** * security_socket_shutdown() - Checks if shutting down the socket is allowed * @sock: socket * @how: flag indicating how sends and receives are handled * * Checks permission before all or part of a connection on the socket @sock is * shut down. * * Return: Returns 0 if permission is granted. */ int security_socket_shutdown(struct socket *sock, int how) { return call_int_hook(socket_shutdown, sock, how); } /** * security_sock_rcv_skb() - Check if an incoming network packet is allowed * @sk: destination sock * @skb: incoming packet * * Check permissions on incoming network packets. This hook is distinct from * Netfilter's IP input hooks since it is the first time that the incoming * sk_buff @skb has been associated with a particular socket, @sk. Must not * sleep inside this hook because some callers hold spinlocks. * * Return: Returns 0 if permission is granted. */ int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { return call_int_hook(socket_sock_rcv_skb, sk, skb); } EXPORT_SYMBOL(security_sock_rcv_skb); /** * security_socket_getpeersec_stream() - Get the remote peer label * @sock: socket * @optval: destination buffer * @optlen: size of peer label copied into the buffer * @len: maximum size of the destination buffer * * This hook allows the security module to provide peer socket security state * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC. * For tcp sockets this can be meaningful if the socket is associated with an * ipsec SA. * * Return: Returns 0 if all is well, otherwise, typical getsockopt return * values. */ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { return call_int_hook(socket_getpeersec_stream, sock, optval, optlen, len); } /** * security_socket_getpeersec_dgram() - Get the remote peer label * @sock: socket * @skb: datagram packet * @secid: remote peer label secid * * This hook allows the security module to provide peer socket security state * for udp sockets on a per-packet basis to userspace via getsockopt * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC * option via getsockopt. It can then retrieve the security state returned by * this hook for a packet via the SCM_SECURITY ancillary message type. * * Return: Returns 0 on success, error on failure. */ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { return call_int_hook(socket_getpeersec_dgram, sock, skb, secid); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); /** * lsm_sock_alloc - allocate a composite sock blob * @sock: the sock that needs a blob * @gfp: allocation mode * * Allocate the sock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_sock_alloc(struct sock *sock, gfp_t gfp) { return lsm_blob_alloc(&sock->sk_security, blob_sizes.lbs_sock, gfp); } /** * security_sk_alloc() - Allocate and initialize a sock's LSM blob * @sk: sock * @family: protocol family * @priority: gfp flags * * Allocate and attach a security structure to the sk->sk_security field, which * is used to copy security attributes between local stream sockets. * * Return: Returns 0 on success, error on failure. */ int security_sk_alloc(struct sock *sk, int family, gfp_t priority) { int rc = lsm_sock_alloc(sk, priority); if (unlikely(rc)) return rc; rc = call_int_hook(sk_alloc_security, sk, family, priority); if (unlikely(rc)) security_sk_free(sk); return rc; } /** * security_sk_free() - Free the sock's LSM blob * @sk: sock * * Deallocate security structure. */ void security_sk_free(struct sock *sk) { call_void_hook(sk_free_security, sk); kfree(sk->sk_security); sk->sk_security = NULL; } /** * security_sk_clone() - Clone a sock's LSM state * @sk: original sock * @newsk: target sock * * Clone/copy security structure. */ void security_sk_clone(const struct sock *sk, struct sock *newsk) { call_void_hook(sk_clone_security, sk, newsk); } EXPORT_SYMBOL(security_sk_clone); /** * security_sk_classify_flow() - Set a flow's secid based on socket * @sk: original socket * @flic: target flow * * Set the target flow's secid to socket's secid. */ void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic) { call_void_hook(sk_getsecid, sk, &flic->flowic_secid); } EXPORT_SYMBOL(security_sk_classify_flow); /** * security_req_classify_flow() - Set a flow's secid based on request_sock * @req: request_sock * @flic: target flow * * Sets @flic's secid to @req's secid. */ void security_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { call_void_hook(req_classify_flow, req, flic); } EXPORT_SYMBOL(security_req_classify_flow); /** * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket * @sk: sock being grafted * @parent: target parent socket * * Sets @parent's inode secid to @sk's secid and update @sk with any necessary * LSM state from @parent. */ void security_sock_graft(struct sock *sk, struct socket *parent) { call_void_hook(sock_graft, sk, parent); } EXPORT_SYMBOL(security_sock_graft); /** * security_inet_conn_request() - Set request_sock state using incoming connect * @sk: parent listening sock * @skb: incoming connection * @req: new request_sock * * Initialize the @req LSM state based on @sk and the incoming connect in @skb. * * Return: Returns 0 if permission is granted. */ int security_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { return call_int_hook(inet_conn_request, sk, skb, req); } EXPORT_SYMBOL(security_inet_conn_request); /** * security_inet_csk_clone() - Set new sock LSM state based on request_sock * @newsk: new sock * @req: connection request_sock * * Set that LSM state of @sock using the LSM state from @req. */ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { call_void_hook(inet_csk_clone, newsk, req); } /** * security_inet_conn_established() - Update sock's LSM state with connection * @sk: sock * @skb: connection packet * * Update @sock's LSM state to represent a new connection from @skb. */ void security_inet_conn_established(struct sock *sk, struct sk_buff *skb) { call_void_hook(inet_conn_established, sk, skb); } EXPORT_SYMBOL(security_inet_conn_established); /** * security_secmark_relabel_packet() - Check if setting a secmark is allowed * @secid: new secmark value * * Check if the process should be allowed to relabel packets to @secid. * * Return: Returns 0 if permission is granted. */ int security_secmark_relabel_packet(u32 secid) { return call_int_hook(secmark_relabel_packet, secid); } EXPORT_SYMBOL(security_secmark_relabel_packet); /** * security_secmark_refcount_inc() - Increment the secmark labeling rule count * * Tells the LSM to increment the number of secmark labeling rules loaded. */ void security_secmark_refcount_inc(void) { call_void_hook(secmark_refcount_inc); } EXPORT_SYMBOL(security_secmark_refcount_inc); /** * security_secmark_refcount_dec() - Decrement the secmark labeling rule count * * Tells the LSM to decrement the number of secmark labeling rules loaded. */ void security_secmark_refcount_dec(void) { call_void_hook(secmark_refcount_dec); } EXPORT_SYMBOL(security_secmark_refcount_dec); /** * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device * @security: pointer to the LSM blob * * This hook allows a module to allocate a security structure for a TUN device, * returning the pointer in @security. * * Return: Returns a zero on success, negative values on failure. */ int security_tun_dev_alloc_security(void **security) { int rc; rc = lsm_blob_alloc(security, blob_sizes.lbs_tun_dev, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(tun_dev_alloc_security, *security); if (rc) { kfree(*security); *security = NULL; } return rc; } EXPORT_SYMBOL(security_tun_dev_alloc_security); /** * security_tun_dev_free_security() - Free a TUN device LSM blob * @security: LSM blob * * This hook allows a module to free the security structure for a TUN device. */ void security_tun_dev_free_security(void *security) { kfree(security); } EXPORT_SYMBOL(security_tun_dev_free_security); /** * security_tun_dev_create() - Check if creating a TUN device is allowed * * Check permissions prior to creating a new TUN device. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_create(void) { return call_int_hook(tun_dev_create); } EXPORT_SYMBOL(security_tun_dev_create); /** * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed * @security: TUN device LSM blob * * Check permissions prior to attaching to a TUN device queue. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach_queue(void *security) { return call_int_hook(tun_dev_attach_queue, security); } EXPORT_SYMBOL(security_tun_dev_attach_queue); /** * security_tun_dev_attach() - Update TUN device LSM state on attach * @sk: associated sock * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's sock structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach(struct sock *sk, void *security) { return call_int_hook(tun_dev_attach, sk, security); } EXPORT_SYMBOL(security_tun_dev_attach); /** * security_tun_dev_open() - Update TUN device LSM state on open * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's security structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_open(void *security) { return call_int_hook(tun_dev_open, security); } EXPORT_SYMBOL(security_tun_dev_open); /** * security_sctp_assoc_request() - Update the LSM on a SCTP association req * @asoc: SCTP association * @skb: packet requesting the association * * Passes the @asoc and @chunk->skb of the association INIT packet to the LSM. * * Return: Returns 0 on success, error on failure. */ int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_request, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); /** * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option * @sk: socket * @optname: SCTP option to validate * @address: list of IP addresses to validate * @addrlen: length of the address list * * Validiate permissions required for each address associated with sock @sk. * Depending on @optname, the addresses will be treated as either a connect or * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6). * * Return: Returns 0 on success, error on failure. */ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { return call_int_hook(sctp_bind_connect, sk, optname, address, addrlen); } EXPORT_SYMBOL(security_sctp_bind_connect); /** * security_sctp_sk_clone() - Clone a SCTP sock's LSM state * @asoc: SCTP association * @sk: original sock * @newsk: target sock * * Called whenever a new socket is created by accept(2) (i.e. a TCP style * socket) or when a socket is 'peeled off' e.g userspace calls * sctp_peeloff(3). */ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { call_void_hook(sctp_sk_clone, asoc, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); /** * security_sctp_assoc_established() - Update LSM state when assoc established * @asoc: SCTP association * @skb: packet establishing the association * * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet to the * security module. * * Return: Returns 0 if permission is granted. */ int security_sctp_assoc_established(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_established, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_established); /** * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket * @sk: the owning MPTCP socket * @ssk: the new subflow * * Update the labeling for the given MPTCP subflow, to match the one of the * owning MPTCP socket. This hook has to be called after the socket creation and * initialization via the security_socket_create() and * security_socket_post_create() LSM hooks. * * Return: Returns 0 on success or a negative error code on failure. */ int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk) { return call_int_hook(mptcp_add_subflow, sk, ssk); } #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND /** * security_ib_pkey_access() - Check if access to an IB pkey is allowed * @sec: LSM blob * @subnet_prefix: subnet prefix of the port * @pkey: IB pkey * * Check permission to access a pkey when modifying a QP. * * Return: Returns 0 if permission is granted. */ int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey) { return call_int_hook(ib_pkey_access, sec, subnet_prefix, pkey); } EXPORT_SYMBOL(security_ib_pkey_access); /** * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed * @sec: LSM blob * @dev_name: IB device name * @port_num: port number * * Check permissions to send and receive SMPs on a end port. * * Return: Returns 0 if permission is granted. */ int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num) { return call_int_hook(ib_endport_manage_subnet, sec, dev_name, port_num); } EXPORT_SYMBOL(security_ib_endport_manage_subnet); /** * security_ib_alloc_security() - Allocate an Infiniband LSM blob * @sec: LSM blob * * Allocate a security structure for Infiniband objects. * * Return: Returns 0 on success, non-zero on failure. */ int security_ib_alloc_security(void **sec) { int rc; rc = lsm_blob_alloc(sec, blob_sizes.lbs_ib, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(ib_alloc_security, *sec); if (rc) { kfree(*sec); *sec = NULL; } return rc; } EXPORT_SYMBOL(security_ib_alloc_security); /** * security_ib_free_security() - Free an Infiniband LSM blob * @sec: LSM blob * * Deallocate an Infiniband security structure. */ void security_ib_free_security(void *sec) { kfree(sec); } EXPORT_SYMBOL(security_ib_free_security); #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM /** * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob * @ctxp: xfrm security context being added to the SPD * @sec_ctx: security label provided by userspace * @gfp: gfp flags * * Allocate a security structure to the xp->security field; the security field * is initialized to NULL when the xfrm_policy is allocated. * * Return: Return 0 if operation was successful. */ int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) { return call_int_hook(xfrm_policy_alloc_security, ctxp, sec_ctx, gfp); } EXPORT_SYMBOL(security_xfrm_policy_alloc); /** * security_xfrm_policy_clone() - Clone xfrm policy LSM state * @old_ctx: xfrm security context * @new_ctxp: target xfrm security context * * Allocate a security structure in new_ctxp that contains the information from * the old_ctx structure. * * Return: Return 0 if operation was successful. */ int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp) { return call_int_hook(xfrm_policy_clone_security, old_ctx, new_ctxp); } /** * security_xfrm_policy_free() - Free a xfrm security context * @ctx: xfrm security context * * Free LSM resources associated with @ctx. */ void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx) { call_void_hook(xfrm_policy_free_security, ctx); } EXPORT_SYMBOL(security_xfrm_policy_free); /** * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed * @ctx: xfrm security context * * Authorize deletion of a SPD entry. * * Return: Returns 0 if permission is granted. */ int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { return call_int_hook(xfrm_policy_delete_security, ctx); } /** * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @sec_ctx: security label provided by userspace * * Allocate a security structure to the @x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to @sec_ctx. * * Return: Return 0 if operation was successful. */ int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return call_int_hook(xfrm_state_alloc, x, sec_ctx); } EXPORT_SYMBOL(security_xfrm_state_alloc); /** * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @polsec: associated policy's security context * @secid: secid from the flow * * Allocate a security structure to the x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to secid. * * Return: Returns 0 if operation was successful. */ int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) { return call_int_hook(xfrm_state_alloc_acquire, x, polsec, secid); } /** * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed * @x: xfrm state * * Authorize deletion of x->security. * * Return: Returns 0 if permission is granted. */ int security_xfrm_state_delete(struct xfrm_state *x) { return call_int_hook(xfrm_state_delete_security, x); } EXPORT_SYMBOL(security_xfrm_state_delete); /** * security_xfrm_state_free() - Free a xfrm state * @x: xfrm state * * Deallocate x->security. */ void security_xfrm_state_free(struct xfrm_state *x) { call_void_hook(xfrm_state_free_security, x); } /** * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed * @ctx: target xfrm security context * @fl_secid: flow secid used to authorize access * * Check permission when a flow selects a xfrm_policy for processing XFRMs on a * packet. The hook is called when selecting either a per-socket policy or a * generic xfrm policy. * * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on * other errors. */ int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { return call_int_hook(xfrm_policy_lookup, ctx, fl_secid); } /** * security_xfrm_state_pol_flow_match() - Check for a xfrm match * @x: xfrm state to match * @xp: xfrm policy to check for a match * @flic: flow to check for a match. * * Check @xp and @flic for a match with @x. * * Return: Returns 1 if there is a match. */ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) { struct lsm_static_call *scall; int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); /* * Since this function is expected to return 0 or 1, the judgment * becomes difficult if multiple LSMs supply this call. Fortunately, * we can use the first LSM's judgment because currently only SELinux * supplies this call. * * For speed optimization, we explicitly break the loop rather than * using the macro */ lsm_for_each_hook(scall, xfrm_state_pol_flow_match) { rc = scall->hl->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; } /** * security_xfrm_decode_session() - Determine the xfrm secid for a packet * @skb: xfrm packet * @secid: secid * * Decode the packet in @skb and return the security label in @secid. * * Return: Return 0 if all xfrms used have the same secid. */ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) { return call_int_hook(xfrm_decode_session, skb, secid, 1); } void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { int rc = call_int_hook(xfrm_decode_session, skb, &flic->flowic_secid, 0); BUG_ON(rc); } EXPORT_SYMBOL(security_skb_classify_flow); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS /** * security_key_alloc() - Allocate and initialize a kernel key LSM blob * @key: key * @cred: credentials * @flags: allocation flags * * Permit allocation of a key and assign security data. Note that key does not * have a serial number assigned at this point. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { int rc = lsm_key_alloc(key); if (unlikely(rc)) return rc; rc = call_int_hook(key_alloc, key, cred, flags); if (unlikely(rc)) security_key_free(key); return rc; } /** * security_key_free() - Free a kernel key LSM blob * @key: key * * Notification of destruction; free security data. */ void security_key_free(struct key *key) { kfree(key->security); key->security = NULL; } /** * security_key_permission() - Check if a kernel key operation is allowed * @key_ref: key reference * @cred: credentials of actor requesting access * @need_perm: requested permissions * * See whether a specific operational right is granted to a process on a key. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { return call_int_hook(key_permission, key_ref, cred, need_perm); } /** * security_key_getsecurity() - Get the key's security label * @key: key * @buffer: security label buffer * * Get a textual representation of the security context attached to a key for * the purposes of honouring KEYCTL_GETSECURITY. This function allocates the * storage for the NUL-terminated string and the caller should free it. * * Return: Returns the length of @buffer (including terminating NUL) or -ve if * an error occurs. May also return 0 (and a NULL buffer pointer) if * there is no security label assigned to the key. */ int security_key_getsecurity(struct key *key, char **buffer) { *buffer = NULL; return call_int_hook(key_getsecurity, key, buffer); } /** * security_key_post_create_or_update() - Notification of key create or update * @keyring: keyring to which the key is linked to * @key: created or updated key * @payload: data used to instantiate or update the key * @payload_len: length of payload * @flags: key flags * @create: flag indicating whether the key was created or updated * * Notify the caller of a key creation or update. */ void security_key_post_create_or_update(struct key *keyring, struct key *key, const void *payload, size_t payload_len, unsigned long flags, bool create) { call_void_hook(key_post_create_or_update, keyring, key, payload, payload_len, flags, create); } #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT /** * security_audit_rule_init() - Allocate and init an LSM audit rule struct * @field: audit action * @op: rule operator * @rulestr: rule context * @lsmrule: receive buffer for audit rule struct * @gfp: GFP flag used for kmalloc * * Allocate and initialize an LSM audit rule structure. * * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of * an invalid rule. */ int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, gfp_t gfp) { return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp); } /** * security_audit_rule_known() - Check if an audit rule contains LSM fields * @krule: audit rule * * Specifies whether given @krule contains any fields related to the current * LSM. * * Return: Returns 1 in case of relation found, 0 otherwise. */ int security_audit_rule_known(struct audit_krule *krule) { return call_int_hook(audit_rule_known, krule); } /** * security_audit_rule_free() - Free an LSM audit rule struct * @lsmrule: audit rule struct * * Deallocate the LSM audit rule structure previously allocated by * audit_rule_init(). */ void security_audit_rule_free(void *lsmrule) { call_void_hook(audit_rule_free, lsmrule); } /** * security_audit_rule_match() - Check if a label matches an audit rule * @prop: security label * @field: LSM audit field * @op: matching operator * @lsmrule: audit rule * * Determine if given @secid matches a rule previously approved by * security_audit_rule_known(). * * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on * failure. */ int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *lsmrule) { return call_int_hook(audit_rule_match, prop, field, op, lsmrule); } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL /** * security_bpf() - Check if the bpf syscall operation is allowed * @cmd: command * @attr: bpf attribute * @size: size * @kernel: whether or not call originated from kernel * * Do a initial check for all bpf syscalls after the attribute is copied into * the kernel. The actual security module can implement their own rules to * check the specific cmd they need. * * Return: Returns 0 if permission is granted. */ int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { return call_int_hook(bpf, cmd, attr, size, kernel); } /** * security_bpf_map() - Check if access to a bpf map is allowed * @map: bpf map * @fmode: mode * * Do a check when the kernel generates and returns a file descriptor for eBPF * maps. * * Return: Returns 0 if permission is granted. */ int security_bpf_map(struct bpf_map *map, fmode_t fmode) { return call_int_hook(bpf_map, map, fmode); } /** * security_bpf_prog() - Check if access to a bpf program is allowed * @prog: bpf program * * Do a check when the kernel generates and returns a file descriptor for eBPF * programs. * * Return: Returns 0 if permission is granted. */ int security_bpf_prog(struct bpf_prog *prog) { return call_int_hook(bpf_prog, prog); } /** * security_bpf_map_create() - Check if BPF map creation is allowed * @map: BPF map object * @attr: BPF syscall attributes used to create BPF map * @token: BPF token used to grant user access * @kernel: whether or not call originated from kernel * * Do a check when the kernel creates a new BPF map. This is also the * point where LSM blob is allocated for LSMs that need them. * * Return: Returns 0 on success, error on failure. */ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token, bool kernel) { return call_int_hook(bpf_map_create, map, attr, token, kernel); } /** * security_bpf_prog_load() - Check if loading of BPF program is allowed * @prog: BPF program object * @attr: BPF syscall attributes used to create BPF program * @token: BPF token used to grant user access to BPF subsystem * @kernel: whether or not call originated from kernel * * Perform an access control check when the kernel loads a BPF program and * allocates associated BPF program object. This hook is also responsible for * allocating any required LSM state for the BPF program. * * Return: Returns 0 on success, error on failure. */ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { return call_int_hook(bpf_prog_load, prog, attr, token, kernel); } /** * security_bpf_token_create() - Check if creating of BPF token is allowed * @token: BPF token object * @attr: BPF syscall attributes used to create BPF token * @path: path pointing to BPF FS mount point from which BPF token is created * * Do a check when the kernel instantiates a new BPF token object from BPF FS * instance. This is also the point where LSM blob can be allocated for LSMs. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { return call_int_hook(bpf_token_create, token, attr, path); } /** * security_bpf_token_cmd() - Check if BPF token is allowed to delegate * requested BPF syscall command * @token: BPF token object * @cmd: BPF syscall command requested to be delegated by BPF token * * Do a check when the kernel decides whether provided BPF token should allow * delegation of requested BPF syscall command. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) { return call_int_hook(bpf_token_cmd, token, cmd); } /** * security_bpf_token_capable() - Check if BPF token is allowed to delegate * requested BPF-related capability * @token: BPF token object * @cap: capabilities requested to be delegated by BPF token * * Do a check when the kernel decides whether provided BPF token should allow * delegation of requested BPF-related capabilities. * * Return: Returns 0 on success, error on failure. */ int security_bpf_token_capable(const struct bpf_token *token, int cap) { return call_int_hook(bpf_token_capable, token, cap); } /** * security_bpf_map_free() - Free a bpf map's LSM blob * @map: bpf map * * Clean up the security information stored inside bpf map. */ void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free, map); } /** * security_bpf_prog_free() - Free a BPF program's LSM blob * @prog: BPF program struct * * Clean up the security information stored inside BPF program. */ void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); } /** * security_bpf_token_free() - Free a BPF token's LSM blob * @token: BPF token struct * * Clean up the security information stored inside BPF token. */ void security_bpf_token_free(struct bpf_token *token) { call_void_hook(bpf_token_free, token); } #endif /* CONFIG_BPF_SYSCALL */ /** * security_locked_down() - Check if a kernel feature is allowed * @what: requested kernel feature * * Determine whether a kernel feature that potentially enables arbitrary code * execution in kernel space should be permitted. * * Return: Returns 0 if permission is granted. */ int security_locked_down(enum lockdown_reason what) { return call_int_hook(locked_down, what); } EXPORT_SYMBOL(security_locked_down); /** * security_bdev_alloc() - Allocate a block device LSM blob * @bdev: block device * * Allocate and attach a security structure to @bdev->bd_security. The * security field is initialized to NULL when the bdev structure is * allocated. * * Return: Return 0 if operation was successful. */ int security_bdev_alloc(struct block_device *bdev) { int rc = 0; rc = lsm_bdev_alloc(bdev); if (unlikely(rc)) return rc; rc = call_int_hook(bdev_alloc_security, bdev); if (unlikely(rc)) security_bdev_free(bdev); return rc; } EXPORT_SYMBOL(security_bdev_alloc); /** * security_bdev_free() - Free a block device's LSM blob * @bdev: block device * * Deallocate the bdev security structure and set @bdev->bd_security to NULL. */ void security_bdev_free(struct block_device *bdev) { if (!bdev->bd_security) return; call_void_hook(bdev_free_security, bdev); kfree(bdev->bd_security); bdev->bd_security = NULL; } EXPORT_SYMBOL(security_bdev_free); /** * security_bdev_setintegrity() - Set the device's integrity data * @bdev: block device * @type: type of integrity, e.g. hash digest, signature, etc * @value: the integrity value * @size: size of the integrity value * * Register a verified integrity measurement of a bdev with LSMs. * LSMs should free the previously saved data if @value is NULL. * Please note that the new hook should be invoked every time the security * information is updated to keep these data current. For example, in dm-verity, * if the mapping table is reloaded and configured to use a different dm-verity * target with a new roothash and signing information, the previously stored * data in the LSM blob will become obsolete. It is crucial to re-invoke the * hook to refresh these data and ensure they are up to date. This necessity * arises from the design of device-mapper, where a device-mapper device is * first created, and then targets are subsequently loaded into it. These * targets can be modified multiple times during the device's lifetime. * Therefore, while the LSM blob is allocated during the creation of the block * device, its actual contents are not initialized at this stage and can change * substantially over time. This includes alterations from data that the LSMs * 'trusts' to those they do not, making it essential to handle these changes * correctly. Failure to address this dynamic aspect could potentially allow * for bypassing LSM checks. * * Return: Returns 0 on success, negative values on failure. */ int security_bdev_setintegrity(struct block_device *bdev, enum lsm_integrity_type type, const void *value, size_t size) { return call_int_hook(bdev_setintegrity, bdev, type, value, size); } EXPORT_SYMBOL(security_bdev_setintegrity); #ifdef CONFIG_PERF_EVENTS /** * security_perf_event_open() - Check if a perf event open is allowed * @type: type of event * * Check whether the @type of perf_event_open syscall is allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_open(int type) { return call_int_hook(perf_event_open, type); } /** * security_perf_event_alloc() - Allocate a perf event LSM blob * @event: perf event * * Allocate and save perf_event security info. * * Return: Returns 0 on success, error on failure. */ int security_perf_event_alloc(struct perf_event *event) { int rc; rc = lsm_blob_alloc(&event->security, blob_sizes.lbs_perf_event, GFP_KERNEL); if (rc) return rc; rc = call_int_hook(perf_event_alloc, event); if (rc) { kfree(event->security); event->security = NULL; } return rc; } /** * security_perf_event_free() - Free a perf event LSM blob * @event: perf event * * Release (free) perf_event security info. */ void security_perf_event_free(struct perf_event *event) { kfree(event->security); event->security = NULL; } /** * security_perf_event_read() - Check if reading a perf event label is allowed * @event: perf event * * Read perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_read(struct perf_event *event) { return call_int_hook(perf_event_read, event); } /** * security_perf_event_write() - Check if writing a perf event label is allowed * @event: perf event * * Write perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_write(struct perf_event *event) { return call_int_hook(perf_event_write, event); } #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_IO_URING /** * security_uring_override_creds() - Check if overriding creds is allowed * @new: new credentials * * Check if the current task, executing an io_uring operation, is allowed to * override it's credentials with @new. * * Return: Returns 0 if permission is granted. */ int security_uring_override_creds(const struct cred *new) { return call_int_hook(uring_override_creds, new); } /** * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed * * Check whether the current task is allowed to spawn a io_uring polling thread * (IORING_SETUP_SQPOLL). * * Return: Returns 0 if permission is granted. */ int security_uring_sqpoll(void) { return call_int_hook(uring_sqpoll); } /** * security_uring_cmd() - Check if a io_uring passthrough command is allowed * @ioucmd: command * * Check whether the file_operations uring_cmd is allowed to run. * * Return: Returns 0 if permission is granted. */ int security_uring_cmd(struct io_uring_cmd *ioucmd) { return call_int_hook(uring_cmd, ioucmd); } /** * security_uring_allowed() - Check if io_uring_setup() is allowed * * Check whether the current task is allowed to call io_uring_setup(). * * Return: Returns 0 if permission is granted. */ int security_uring_allowed(void) { return call_int_hook(uring_allowed); } #endif /* CONFIG_IO_URING */ /** * security_initramfs_populated() - Notify LSMs that initramfs has been loaded * * Tells the LSMs the initramfs has been unpacked into the rootfs. */ void security_initramfs_populated(void) { call_void_hook(initramfs_populated); }
18 21 341 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BKEY_TYPES_H #define _BCACHEFS_BKEY_TYPES_H #include "bcachefs_format.h" /* * bkey_i - bkey with inline value * bkey_s - bkey with split value * bkey_s_c - bkey with split value, const */ #define bkey_p_next(_k) vstruct_next(_k) static inline struct bkey_i *bkey_next(struct bkey_i *k) { return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); } #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) static inline size_t bkey_val_bytes(const struct bkey *k) { return bkey_val_u64s(k) * sizeof(u64); } static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) { unsigned u64s = BKEY_U64s + val_u64s; BUG_ON(u64s > U8_MAX); k->u64s = u64s; } static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) { set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); } #define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) #define bkey_whiteout(_k) \ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) /* bkey with split value, const */ struct bkey_s_c { const struct bkey *k; const struct bch_val *v; }; /* bkey with split value */ struct bkey_s { union { struct { struct bkey *k; struct bch_val *v; }; struct bkey_s_c s_c; }; }; #define bkey_s_null ((struct bkey_s) { .k = NULL }) #define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) #define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) #define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) static inline struct bkey_s bkey_to_s(struct bkey *k) { return (struct bkey_s) { .k = k, .v = NULL }; } static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) { return (struct bkey_s_c) { .k = k, .v = NULL }; } static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) { return (struct bkey_s) { .k = &k->k, .v = &k->v }; } static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) { return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; } /* * For a given type of value (e.g. struct bch_extent), generates the types for * bkey + bch_extent - inline, split, split const - and also all the conversion * functions, which also check that the value is of the correct type. * * We use anonymous unions for upcasting - e.g. converting from e.g. a * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion * functions. */ #define x(name, ...) \ struct bkey_i_##name { \ union { \ struct bkey k; \ struct bkey_i k_i; \ }; \ struct bch_##name v; \ }; \ \ struct bkey_s_c_##name { \ union { \ struct { \ const struct bkey *k; \ const struct bch_##name *v; \ }; \ struct bkey_s_c s_c; \ }; \ }; \ \ struct bkey_s_##name { \ union { \ struct { \ struct bkey *k; \ struct bch_##name *v; \ }; \ struct bkey_s_c_##name c; \ struct bkey_s s; \ struct bkey_s_c s_c; \ }; \ }; \ \ static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ { \ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline const struct bkey_i_##name * \ bkey_i_to_##name##_c(const struct bkey_i *k) \ { \ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ { \ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ }; \ } \ \ static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ { \ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ }; \ } \ \ static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ { \ return (struct bkey_s_##name) { \ .k = &k->k, \ .v = &k->v, \ }; \ } \ \ static inline struct bkey_s_c_##name \ name##_i_to_s_c(const struct bkey_i_##name *k) \ { \ return (struct bkey_s_c_##name) { \ .k = &k->k, \ .v = &k->v, \ }; \ } \ \ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ { \ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ }; \ } \ \ static inline struct bkey_s_c_##name \ bkey_i_to_s_c_##name(const struct bkey_i *k) \ { \ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ }; \ } \ \ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ { \ struct bkey_i_##name *k = \ container_of(&_k->k, struct bkey_i_##name, k); \ \ bkey_init(&k->k); \ memset(&k->v, 0, sizeof(k->v)); \ k->k.type = KEY_TYPE_##name; \ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ \ return k; \ } BCH_BKEY_TYPES(); #undef x enum bch_validate_flags { BCH_VALIDATE_write = BIT(0), BCH_VALIDATE_commit = BIT(1), BCH_VALIDATE_silent = BIT(2), }; #define BKEY_VALIDATE_CONTEXTS() \ x(unknown) \ x(superblock) \ x(journal) \ x(btree_root) \ x(btree_node) \ x(commit) struct bkey_validate_context { enum { #define x(n) BKEY_VALIDATE_##n, BKEY_VALIDATE_CONTEXTS() #undef x } from:8; enum bch_validate_flags flags:8; u8 level; enum btree_id btree; bool root:1; unsigned journal_offset; u64 journal_seq; }; #endif /* _BCACHEFS_BKEY_TYPES_H */
8 8 7 1 7 4 4 5 5 5 5 5 2 5 5 5 4 3 4 4 3 4 4 4 4 4 4 4 3 3 3 3 3 7 5 2 4 3 7 6 7 7 6 1 1 15 5 2 10 11 11 1 2 8 3 3 7 8 2 6 4 4 7 8 18 2 1 3 12 1 14 1 3 9 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 // SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2007 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * */ #include <linux/slab.h> #include "ext4_jbd2.h" #include "ext4_extents.h" /* * The contiguous blocks details which can be * represented by a single extent */ struct migrate_struct { ext4_lblk_t first_block, last_block, curr_block; ext4_fsblk_t first_pblock, last_pblock; }; static int finish_range(handle_t *handle, struct inode *inode, struct migrate_struct *lb) { int retval = 0, needed; struct ext4_extent newext; struct ext4_ext_path *path; if (lb->first_pblock == 0) return 0; /* Add the extent to temp inode*/ newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); /* Locking only for convenience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); goto err_out; } /* * Calculate the credit needed to inserting this extent * Since we are doing this in loop we may accumulate extra * credit. But below we try to not accumulate too much * of them by restarting the journal. */ needed = ext4_ext_calc_credits_for_single_extent(inode, lb->last_block - lb->first_block + 1, path); retval = ext4_datasem_ensure_credits(handle, inode, needed, needed, 0); if (retval < 0) goto err_out; path = ext4_ext_insert_extent(handle, inode, path, &newext, 0); if (IS_ERR(path)) retval = PTR_ERR(path); err_out: up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); lb->first_pblock = 0; return retval; } static int update_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { int retval; /* * See if we can add on to the existing range (if it exists) */ if (lb->first_pblock && (lb->last_pblock+1 == pblock) && (lb->last_block+1 == lb->curr_block)) { lb->last_pblock = pblock; lb->last_block = lb->curr_block; lb->curr_block++; return 0; } /* * Start a new range. */ retval = finish_range(handle, inode, lb); lb->first_pblock = lb->last_pblock = pblock; lb->first_block = lb->last_block = lb->curr_block; lb->curr_block++; return retval; } static int update_ind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { lb->curr_block++; } } put_bh(bh); return retval; } static int update_dind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_ind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries; } } put_bh(bh); return retval; } static int update_tind_extent_range(handle_t *handle, struct inode *inode, ext4_fsblk_t pblock, struct migrate_struct *lb) { struct buffer_head *bh; __le32 *i_data; int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, pblock, 0); if (IS_ERR(bh)) return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (i_data[i]) { retval = update_dind_extent_range(handle, inode, le32_to_cpu(i_data[i]), lb); if (retval) break; } else { /* Only update the file block number */ lb->curr_block += max_entries * max_entries; } } put_bh(bh); return retval; } static int free_dind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i; __le32 *tmp_idata; struct buffer_head *bh; struct super_block *sb = inode->i_sb; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; int err; bh = ext4_sb_bread(sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) { put_bh(bh); return err; } ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } } put_bh(bh); err = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(sb, 1)); if (err < 0) return err; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_tind_blocks(handle_t *handle, struct inode *inode, __le32 i_data) { int i, retval = 0; __le32 *tmp_idata; struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); if (IS_ERR(bh)) return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { retval = free_dind_blocks(handle, inode, tmp_idata[i]); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) { int retval; /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } /* ei->i_data[EXT4_DIND_BLOCK] */ if (i_data[1]) { retval = free_dind_blocks(handle, inode, i_data[1]); if (retval) return retval; } /* ei->i_data[EXT4_TIND_BLOCK] */ if (i_data[2]) { retval = free_tind_blocks(handle, inode, i_data[2]); if (retval) return retval; } return 0; } static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, struct inode *tmp_inode) { int retval, retval2 = 0; __le32 i_data[3]; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); /* * One credit accounted for writing the * i_data field of the original inode */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto err_out; i_data[0] = ei->i_data[EXT4_IND_BLOCK]; i_data[1] = ei->i_data[EXT4_DIND_BLOCK]; i_data[2] = ei->i_data[EXT4_TIND_BLOCK]; down_write(&EXT4_I(inode)->i_data_sem); /* * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation * happened after we started the migrate. We need to * fail the migrate */ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { retval = -EAGAIN; up_write(&EXT4_I(inode)->i_data_sem); goto err_out; } else ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); /* * We have the extent map build with the tmp inode. * Now copy the i_data across */ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* * Update i_blocks with the new blocks that got * allocated while adding extents for extent index * blocks. * * While converting to extents we need not * update the original inode i_blocks for extent blocks * via quota APIs. The quota update happened via tmp_inode already. */ spin_lock(&inode->i_lock); inode->i_blocks += tmp_inode->i_blocks; spin_unlock(&inode->i_lock); up_write(&EXT4_I(inode)->i_data_sem); /* * We mark the inode dirty after, because we decrement the * i_blocks when freeing the indirect meta-data blocks */ retval = free_ind_block(handle, inode, i_data); retval2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(retval2 && !retval)) retval = retval2; err_out: return retval; } static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_idx *ix) { int i, retval = 0; ext4_fsblk_t block; struct buffer_head *bh; struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); bh = ext4_sb_bread(inode->i_sb, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) { put_bh(bh); return retval; } } } put_bh(bh); retval = ext4_journal_ensure_credits(handle, EXT4_RESERVE_TRANS_BLOCKS, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (retval < 0) return retval; ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; } /* * Free the extent meta data blocks only */ static int free_ext_block(handle_t *handle, struct inode *inode) { int i, retval = 0; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent_header *eh = (struct ext4_extent_header *)ei->i_data; struct ext4_extent_idx *ix; if (eh->eh_depth == 0) /* * No extra blocks allocated for extent meta data */ return 0; ix = EXT_FIRST_INDEX(eh); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ix++) { retval = free_ext_idx(handle, inode, ix); if (retval) return retval; } return retval; } int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; __le32 *i_data; struct ext4_inode_info *ei; struct inode *tmp_inode = NULL; struct migrate_struct lb; unsigned long max_entries; __u32 goal, tmp_csum_seed; uid_t owner[2]; int alloc_ctx; /* * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ if (!ext4_has_feature_extents(inode->i_sb) || ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || ext4_has_inline_data(inode)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) /* * don't migrate fast symlink */ return retval; alloc_ctx = ext4_writepages_down_write(inode->i_sb); /* * Worst case we can touch the allocation bitmaps and a block * group descriptor block. We do need to worry about * credits for modifying the quota inode. */ handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_unlock; } goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), S_IFREG, NULL, goal, owner, 0); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); ext4_journal_stop(handle); goto out_unlock; } /* * Use the correct seed for checksum (i.e. the seed from 'inode'). This * is so that the metadata blocks will have the correct checksum after * the migration. */ ei = EXT4_I(inode); tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed; EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* * Set the i_nlink to zero so it will be deleted later * when we drop inode reference. */ clear_nlink(tmp_inode); ext4_ext_tree_init(handle, tmp_inode); ext4_journal_stop(handle); /* * start with one credit accounted for * superblock modification. * * For the tmp_inode we already have committed the * transaction that created the inode. Later as and * when we add extents we extent the journal */ /* * Even though we take i_rwsem we can still cause block * allocation via mmap write to holes. If we have allocated * new blocks we fail migrate. New block allocation will * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated * with i_data_sem held to prevent racing with block * allocation. */ down_read(&EXT4_I(inode)->i_data_sem); ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); up_read((&EXT4_I(inode)->i_data_sem)); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); goto out_tmp_inode; } i_data = ei->i_data; memset(&lb, 0, sizeof(lb)); /* 32 bit block address 4 bytes */ max_entries = inode->i_sb->s_blocksize >> 2; for (i = 0; i < EXT4_NDIR_BLOCKS; i++) { if (i_data[i]) { retval = update_extent_range(handle, tmp_inode, le32_to_cpu(i_data[i]), &lb); if (retval) goto err_out; } else lb.curr_block++; } if (i_data[EXT4_IND_BLOCK]) { retval = update_ind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_IND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries; if (i_data[EXT4_DIND_BLOCK]) { retval = update_dind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_DIND_BLOCK]), &lb); if (retval) goto err_out; } else lb.curr_block += max_entries * max_entries; if (i_data[EXT4_TIND_BLOCK]) { retval = update_tind_extent_range(handle, tmp_inode, le32_to_cpu(i_data[EXT4_TIND_BLOCK]), &lb); if (retval) goto err_out; } /* * Build the last extent */ retval = finish_range(handle, tmp_inode, &lb); err_out: if (retval) /* * Failure case delete the extent information with the * tmp_inode */ free_ext_block(handle, tmp_inode); else { retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); if (retval) /* * if we fail to swap inode data free the extent * details of the tmp inode */ free_ext_block(handle, tmp_inode); } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ retval = ext4_journal_ensure_credits(handle, 1, 0); if (retval < 0) goto out_stop; /* * Mark the tmp_inode as of size zero */ i_size_write(tmp_inode, 0); /* * set the i_blocks count to zero * so that the ext4_evict_inode() does the * right job * * We don't need to take the i_lock because * the inode is not visible to user space. */ tmp_inode->i_blocks = 0; EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed; /* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode); out_stop: ext4_journal_stop(handle); out_tmp_inode: unlock_new_inode(tmp_inode); iput(tmp_inode); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return retval; } /* * Migrate a simple extent-based inode to use the i_blocks[] array */ int ext4_ind_migrate(struct inode *inode) { struct ext4_extent_header *eh; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_super_block *es = sbi->s_es; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; int ret, ret2 = 0; int alloc_ctx; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; if (ext4_has_feature_bigalloc(inode->i_sb)) return -EOPNOTSUPP; /* * In order to get correct extent info, force all delayed allocation * blocks to be allocated, otherwise delayed allocation blocks may not * be reflected and bypass the checks on extent header. */ if (test_opt(inode->i_sb, DELALLOC)) ext4_alloc_da_blocks(inode); alloc_ctx = ext4_writepages_down_write(inode->i_sb); handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_unlock; } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_check_inode(inode); if (ret) goto errout; eh = ext_inode_hdr(inode); ex = EXT_FIRST_EXTENT(eh); if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS || eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) { ret = -EOPNOTSUPP; goto errout; } if (eh->eh_entries == 0) blk = len = start = end = 0; else { len = le16_to_cpu(ex->ee_len); blk = ext4_ext_pblock(ex); start = le32_to_cpu(ex->ee_block); end = start + len - 1; if (end >= EXT4_NDIR_BLOCKS) { ret = -EOPNOTSUPP; goto errout; } } ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2 && !ret)) ret = ret2; errout: up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); out_unlock: ext4_writepages_up_write(inode->i_sb, alloc_ctx); return ret; }
4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2008-2010, 2013 Dave Chinner * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_icreate_item.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_ialloc.h" #include "xfs_trace.h" struct kmem_cache *xfs_icreate_cache; /* inode create item */ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_icreate_item, ic_item); } /* * This returns the number of iovecs needed to log the given inode item. * * We only need one iovec for the icreate log structure. */ STATIC void xfs_icreate_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { *nvecs += 1; *nbytes += sizeof(struct xfs_icreate_log); } /* * This is called to fill in the vector of log iovecs for the * given inode create log item. */ STATIC void xfs_icreate_item_format( struct xfs_log_item *lip, struct xfs_log_vec *lv) { struct xfs_icreate_item *icp = ICR_ITEM(lip); struct xfs_log_iovec *vecp = NULL; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, &icp->ic_format, sizeof(struct xfs_icreate_log)); } STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } static const struct xfs_item_ops xfs_icreate_item_ops = { .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, .iop_release = xfs_icreate_item_release, }; /* * Initialize the inode log item for a newly allocated (in-core) inode. * * Inode extents can only reside within an AG. Hence specify the starting * block for the inode chunk by offset within an AG as well as the * length of the allocated extent. * * This joins the item to the transaction and marks it dirty so * that we don't need a separate call to do this, nor does the * caller need to know anything about the icreate item. */ void xfs_icreate_log( struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, unsigned int count, unsigned int inode_size, xfs_agblock_t length, unsigned int generation) { struct xfs_icreate_item *icp; icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, &xfs_icreate_item_ops); icp->ic_format.icl_type = XFS_LI_ICREATE; icp->ic_format.icl_size = 1; /* single vector */ icp->ic_format.icl_ag = cpu_to_be32(agno); icp->ic_format.icl_agbno = cpu_to_be32(agbno); icp->ic_format.icl_count = cpu_to_be32(count); icp->ic_format.icl_isize = cpu_to_be32(inode_size); icp->ic_format.icl_length = cpu_to_be32(length); icp->ic_format.icl_gen = cpu_to_be32(generation); xfs_trans_add_item(tp, &icp->ic_item); tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } static enum xlog_recover_reorder xlog_recover_icreate_reorder( struct xlog_recover_item *item) { /* * Inode allocation buffers must be replayed before subsequent inode * items try to modify those buffers. ICREATE items are the logical * equivalent of logging a newly initialized inode buffer, so recover * these at the same time that we recover logged buffers. */ return XLOG_REORDER_BUFFER_LIST; } /* * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that * match the range to be initialised, stamped with inode templates and written * by delayed write so that subsequent modifications will hit the cached buffer * and only need writing out at the end of recovery. */ STATIC int xlog_recover_icreate_commit_pass2( struct xlog *log, struct list_head *buffer_list, struct xlog_recover_item *item, xfs_lsn_t lsn) { struct xfs_mount *mp = log->l_mp; struct xfs_icreate_log *icl; struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_agnumber_t agno; xfs_agblock_t agbno; unsigned int count; unsigned int isize; xfs_agblock_t length; int bb_per_cluster; int cancel_count; int nbufs; int i; icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; if (icl->icl_type != XFS_LI_ICREATE) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); return -EINVAL; } if (icl->icl_size != 1) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); return -EINVAL; } agno = be32_to_cpu(icl->icl_ag); if (agno >= mp->m_sb.sb_agcount) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); return -EINVAL; } agbno = be32_to_cpu(icl->icl_agbno); if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); return -EINVAL; } isize = be32_to_cpu(icl->icl_isize); if (isize != mp->m_sb.sb_inodesize) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); return -EINVAL; } count = be32_to_cpu(icl->icl_count); if (!count) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); return -EINVAL; } length = be32_to_cpu(icl->icl_length); if (!length || length >= mp->m_sb.sb_agblocks) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); return -EINVAL; } /* * The inode chunk is either full or sparse and we only support * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. */ if (length != igeo->ialloc_blks && length != igeo->ialloc_min_blks) { xfs_warn(log->l_mp, "%s: unsupported chunk length", __func__); return -EINVAL; } /* verify inode count is consistent with extent length */ if ((count >> mp->m_sb.sb_inopblog) != length) { xfs_warn(log->l_mp, "%s: inconsistent inode count and chunk length", __func__); return -EINVAL; } /* * The icreate transaction can cover multiple cluster buffers and these * buffers could have been freed and reused. Check the individual * buffers for cancellation so we don't overwrite anything written after * a cancellation. */ bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); nbufs = length / igeo->blocks_per_cluster; for (i = 0, cancel_count = 0; i < nbufs; i++) { xfs_daddr_t daddr; daddr = XFS_AGB_TO_DADDR(mp, agno, agbno + i * igeo->blocks_per_cluster); if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster)) cancel_count++; } /* * We currently only use icreate for a single allocation at a time. This * means we should expect either all or none of the buffers to be * cancelled. Be conservative and skip replay if at least one buffer is * cancelled, but warn the user that something is awry if the buffers * are not consistent. * * XXX: This must be refined to only skip cancelled clusters once we use * icreate for multiple chunk allocations. */ ASSERT(!cancel_count || cancel_count == nbufs); if (cancel_count) { if (cancel_count != nbufs) xfs_warn(mp, "WARNING: partial inode chunk cancellation, skipped icreate."); trace_xfs_log_recover_icreate_cancel(log, icl); return 0; } trace_xfs_log_recover_icreate_recover(log, icl); return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, be32_to_cpu(icl->icl_gen)); } const struct xlog_recover_item_ops xlog_icreate_item_ops = { .item_type = XFS_LI_ICREATE, .reorder = xlog_recover_icreate_reorder, .commit_pass2 = xlog_recover_icreate_commit_pass2, };
32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Socket Filter Data Structures */ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ #include <linux/atomic.h> #include <linux/bpf.h> #include <linux/refcount.h> #include <linux/compat.h> #include <linux/skbuff.h> #include <linux/linkage.h> #include <linux/printk.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/capability.h> #include <linux/set_memory.h> #include <linux/kallsyms.h> #include <linux/if_vlan.h> #include <linux/vmalloc.h> #include <linux/sockptr.h> #include <crypto/sha1.h> #include <linux/u64_stats_sync.h> #include <net/sch_generic.h> #include <asm/byteorder.h> #include <uapi/linux/filter.h> struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; struct ctl_table; struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. */ #define BPF_REG_ARG1 BPF_REG_1 #define BPF_REG_ARG2 BPF_REG_2 #define BPF_REG_ARG3 BPF_REG_3 #define BPF_REG_ARG4 BPF_REG_4 #define BPF_REG_ARG5 BPF_REG_5 #define BPF_REG_CTX BPF_REG_6 #define BPF_REG_FP BPF_REG_10 /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_2 /* scratch reg */ #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 /* unused opcode to mark special load instruction. Same as BPF_ABS */ #define BPF_PROBE_MEM 0x20 /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 /* unused opcode to mark special load instruction. Same as BPF_MSH */ #define BPF_PROBE_MEM32 0xa0 /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 /* unused opcode to mark speculation barrier for mitigating * Speculative Store Bypass */ #define BPF_NOSPEC 0xc0 /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. */ #define BPF_SYM_ELF_TYPE 't' /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ #define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU64_REG(OP, DST, SRC) \ BPF_ALU64_REG_OFF(OP, DST, SRC, 0) #define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU32_REG(OP, DST, SRC) \ BPF_ALU32_REG_OFF(OP, DST, SRC, 0) /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ #define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU64_IMM(OP, DST, IMM) \ BPF_ALU64_IMM_OFF(OP, DST, IMM, 0) #define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU32_IMM(OP, DST, IMM) \ BPF_ALU32_IMM_OFF(OP, DST, IMM, 0) /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ #define BPF_ENDIAN(TYPE, DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Byte Swap, bswap16/32/64 */ #define BPF_BSWAP(DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Short form of mov, dst_reg = src_reg */ #define BPF_MOV64_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_MOV32_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) /* Special (internal-only) form of mov, used to resolve per-CPU addrs: * dst_reg = src_reg + <percpu_base_off> * BPF_ADDR_PERCPU is used as a special insn->off value. */ #define BPF_ADDR_PERCPU (-1) #define BPF_MOV64_PERCPU_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = BPF_ADDR_PERCPU, \ .imm = 0 }) static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; } /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ #define BPF_MOVSX64_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_MOVSX32_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Special form of mov32, used for doing explicit zero extension on dst. */ #define BPF_ZEXT_REG(DST) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = DST, \ .off = 0, \ .imm = 1 }) static inline bool insn_is_zext(const struct bpf_insn *insn) { return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; } /* addr_space_cast from as(0) to as(1) is for converting bpf arena pointers * to pointers in user vma. */ static inline bool insn_is_cast_user(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1U << 16; } /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_DW | BPF_IMM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = (__u32) (IMM) }), \ ((struct bpf_insn) { \ .code = 0, /* zero is reserved opcode */ \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ #define BPF_LD_ABS(SIZE, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */ #define BPF_LD_IND(SIZE, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ .dst_reg = 0, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory load, dst_reg = *(signed size *) (src_reg + off16) */ #define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_AND *(uint *) (dst_reg + off16) &= src_reg * BPF_OR *(uint *) (dst_reg + off16) |= src_reg * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) * BPF_LOAD_ACQ dst_reg = smp_load_acquire(src_reg + off16) * BPF_STORE_REL smp_store_release(dst_reg + off16, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) /* Legacy alias */ #define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ #define BPF_JMP_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = OFF, \ .imm = 0 }) /* Unconditional jumps, gotol pc + imm32 */ #define BPF_JMP32_A(IMM) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Relative call */ #define BPF_CALL_REL(TGT) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_CALL, \ .off = 0, \ .imm = TGT }) /* Convert function address to BPF immediate */ #define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = BPF_CALL_IMM(FUNC) }) /* Kfunc call */ #define BPF_CALL_KFUNC(OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_KFUNC_CALL, \ .off = OFF, \ .imm = IMM }) /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct bpf_insn) { \ .code = CODE, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) /* Program exit */ #define BPF_EXIT_INSN() \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_EXIT, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Speculation barrier */ #define BPF_ST_NOSPEC() \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_NOSPEC, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ ((struct sock_filter) BPF_STMT(CODE, K)) #define __BPF_JUMP(CODE, K, JT, JF) \ ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF)) #define bytes_to_bpf_size(bytes) \ ({ \ int bpf_size = -EINVAL; \ \ if (bytes == sizeof(u8)) \ bpf_size = BPF_B; \ else if (bytes == sizeof(u16)) \ bpf_size = BPF_H; \ else if (bytes == sizeof(u32)) \ bpf_size = BPF_W; \ else if (bytes == sizeof(u64)) \ bpf_size = BPF_DW; \ \ bpf_size; \ }) #define bpf_size_to_bytes(bpf_size) \ ({ \ int bytes = -EINVAL; \ \ if (bpf_size == BPF_B) \ bytes = sizeof(u8); \ else if (bpf_size == BPF_H) \ bytes = sizeof(u16); \ else if (bpf_size == BPF_W) \ bytes = sizeof(u32); \ else if (bpf_size == BPF_DW) \ bytes = sizeof(u64); \ \ bytes; \ }) #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_FIELD_SIZEOF(type, field) \ ({ \ const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_LDST_BYTES(insn) \ ({ \ const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ WARN_ON(__size < 0); \ __size; \ }) #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) #define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) #define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) #define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) #define __BPF_REG_0(...) __BPF_PAD(5) #define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) #define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) #define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) #define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) #define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) #define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) #define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) #define __BPF_CAST(t, a) \ (__force t) \ (__force \ typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ (unsigned long)0, (t)0))) a #define __BPF_V void #define __BPF_N #define __BPF_DECL_ARGS(t, a) t a #define __BPF_DECL_REGS(t, a) u64 a #define __BPF_PAD(n) \ __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) #define BPF_CALL_x(x, attr, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define __NOATTR #define BPF_CALL_0(name, ...) BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_2(name, ...) BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_3(name, ...) BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_4(name, ...) BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__) #define NOTRACE_BPF_CALL_1(name, ...) BPF_CALL_x(1, notrace, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 #if BITS_PER_LONG == 64 # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #else # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 #endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE)); \ *(PTR_SIZE) = (SIZE); \ offsetof(TYPE, MEMBER); \ }) /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { u16 len; compat_uptr_t filter; /* struct sock_filter * */ }; struct sock_fprog_kern { u16 len; struct sock_filter *filter; }; /* Some arches need doubleword alignment for their instructions and/or data */ #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog_stats { u64_stats_t cnt; u64_stats_t nsecs; u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); struct bpf_timed_may_goto { u64 count; u64 timestamp; }; struct sk_filter { refcount_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)); static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, const void *ctx, bpf_dispatcher_fn dfunc) { u32 ret; cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 duration, start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); duration = sched_clock() - start; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } return ret; } static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) { return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); } /* * Use in preemptible and therefore migratable context to make sure that * the execution of the BPF program runs on one CPU. * * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) { u32 ret; migrate_disable(); ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; void *data_meta; void *data_end; }; struct bpf_nh_params { u32 nh_family; union { u32 ipv4_nh; struct in6_addr ipv6_nh; }; }; /* flags for bpf_redirect_info kern_flags */ #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ #define BPF_RI_F_RI_INIT BIT(1) #define BPF_RI_F_CPU_MAP_INIT BIT(2) #define BPF_RI_F_DEV_MAP_INIT BIT(3) #define BPF_RI_F_XSK_MAP_INIT BIT(4) struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; u32 kern_flags; }; struct bpf_net_context { struct bpf_redirect_info ri; struct list_head cpu_map_flush_list; struct list_head dev_map_flush_list; struct list_head xskmap_map_flush_list; }; static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx) { struct task_struct *tsk = current; if (tsk->bpf_net_context != NULL) return NULL; bpf_net_ctx->ri.kern_flags = 0; tsk->bpf_net_context = bpf_net_ctx; return bpf_net_ctx; } static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx) { if (bpf_net_ctx) current->bpf_net_context = NULL; } static inline struct bpf_net_context *bpf_net_ctx_get(void) { return current->bpf_net_context; } static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) { memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh)); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT; } return &bpf_net_ctx->ri; } static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT; } return &bpf_net_ctx->cpu_map_flush_list; } static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT; } return &bpf_net_ctx->dev_map_flush_list; } static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT; } return &bpf_net_ctx->xskmap_map_flush_list; } static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map, struct list_head **lh_dev, struct list_head **lh_xsk) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); u32 kern_flags = bpf_net_ctx->ri.kern_flags; struct list_head *lh; *lh_map = *lh_dev = *lh_xsk = NULL; if (!IS_ENABLED(CONFIG_BPF_SYSCALL)) return; lh = &bpf_net_ctx->dev_map_flush_list; if (kern_flags & BPF_RI_F_DEV_MAP_INIT && !list_empty(lh)) *lh_dev = lh; lh = &bpf_net_ctx->cpu_map_flush_list; if (kern_flags & BPF_RI_F_CPU_MAP_INIT && !list_empty(lh)) *lh_map = lh; lh = &bpf_net_ctx->xskmap_map_flush_list; if (IS_ENABLED(CONFIG_XDP_SOCKETS) && kern_flags & BPF_RI_F_XSK_MAP_INIT && !list_empty(lh)) *lh_xsk = lh; } /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) * ensure that cb[] area can be written to when BPF program is * invoked (otherwise cb[] save/restore is necessary). */ static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); cb->data_meta = skb->data - skb_metadata_len(skb); cb->data_end = skb->data + skb_headlen(skb); } /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ static inline void bpf_compute_and_save_data_end( struct sk_buff *skb, void **saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; *saved_data_end = cb->data_end; cb->data_end = skb->data + skb_headlen(skb); } /* Restore data saved by bpf_compute_and_save_data_end(). */ static inline void bpf_restore_data_end( struct sk_buff *skb, void *saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; cb->data_end = saved_data_end; } static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with * tc, that scratch memory is mapped to qdisc_skb_cb's data area. * * In some socket filter cases, the cb unfortunately needs to be * saved/restored so that protocol specific skb->cb[] data won't * be lost. In any case, due to unpriviledged eBPF programs * attached to sockets, we need to clear the bpf_skb_cb() area * to not leak previous contents to user space. */ BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN); BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != sizeof_field(struct qdisc_skb_cb, data)); return qdisc_skb_cb(skb)->data; } /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, const void *ctx) { const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; if (unlikely(prog->cb_access)) { memcpy(cb_saved, cb_data, sizeof(cb_saved)); memset(cb_data, 0, sizeof(cb_saved)); } res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u32 res; migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); migrate_enable(); return res; } static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); u32 res; if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } DECLARE_BPF_DISPATCHER(xdp) DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp); void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); } static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) { return round_up(bpf_prog_insn_size(prog) + sizeof(__be64) + 1, SHA1_BLOCK_SIZE); } static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), offsetof(struct bpf_prog, insns[proglen])); } static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) { /* When classic BPF programs have been loaded and the arch * does not have a classic BPF JIT (anymore), they have been * converted via bpf_migrate_filter() to eBPF and thus always * have an unspec program type. */ return prog->type == BPF_PROG_TYPE_UNSPEC; } static inline u32 bpf_ctx_off_adjust_machine(u32 size) { const u32 size_machine = sizeof(unsigned long); if (size > size_machine && size % size_machine == 0) size = size_machine; return size; } static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { return size <= size_default && (size & (size - 1)) == 0; } static inline u8 bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) { u8 access_off = off & (size_default - 1); #ifdef __LITTLE_ENDIAN return access_off; #else return size_default - (access_off + size); #endif } #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ off % sizeof(__u64) == 0) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); return set_memory_ro((unsigned long)fp, fp->pages); } #endif return 0; } static inline int __must_check bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { return sk_filter_trim_cap(sk, skb, 1); } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); void bpf_prog_jit_attempt_done(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { __bpf_prog_free(fp); } typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); bool bpf_jit_supports_timed_may_goto(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); u64 arch_bpf_timed_may_goto(void); u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *); bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { unsigned int len; if (unlikely(!(fwd->flags & IFF_UP))) return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; if (pktlen > len) return -EMSGSIZE; return 0; } /* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, const struct bpf_prog *prog); void xdp_do_flush(void); void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { return NULL; } #endif #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); u64 bpf_jit_alloc_exec_limit(void); void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || fp->aux->ksym.lnode.prev == LIST_POISON2; } struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, unsigned int alignment, struct bpf_binary_header **rw_hdr, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns); int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } static inline bool bpf_jit_is_ebpf(void) { # ifdef CONFIG_HAVE_EBPF_JIT return true; # else return false; # endif } static inline bool ebpf_jit_enabled(void) { return bpf_jit_enable && bpf_jit_is_ebpf(); } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return fp->jited && bpf_jit_is_ebpf(); } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { /* These are the prerequisites, should someone ever have the * idea to call blinding outside of them, we make sure to * bail out. */ if (!bpf_jit_is_ebpf()) return false; if (!prog->jit_requested) return false; if (!bpf_jit_harden) return false; if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; } static inline bool bpf_jit_kallsyms_enabled(void) { /* There are a couple of corner cases where kallsyms should * not be enabled f.e. on hardening. */ if (bpf_jit_harden) return false; if (!bpf_jit_kallsyms) return false; if (bpf_jit_kallsyms == 1) return true; return false; } int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { int ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; return ret; } void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); #else /* CONFIG_BPF_JIT */ static inline bool ebpf_jit_enabled(void) { return false; } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { return false; } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; } static inline int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { return -ENOTSUPP; } static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } static inline bool bpf_jit_kallsyms_enabled(void) { return false; } static inline int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { return 0; } static inline bool is_bpf_text_address(unsigned long addr) { return false; } static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { return -ERANGE; } static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { return NULL; } static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { return 0; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) { switch (first->code) { case BPF_RET | BPF_K: case BPF_LD | BPF_W | BPF_LEN: return false; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: if (first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X) return true; return false; default: return true; } } static inline u16 bpf_anc_helper(const struct sock_filter *ftest) { BUG_ON(ftest->code & BPF_ANC); switch (ftest->code) { case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: #define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ return BPF_ANC | SKF_AD_##CODE switch (ftest->k) { BPF_ANCILLARY(PROTOCOL); BPF_ANCILLARY(PKTTYPE); BPF_ANCILLARY(IFINDEX); BPF_ANCILLARY(NLATTR); BPF_ANCILLARY(NLATTR_NEST); BPF_ANCILLARY(MARK); BPF_ANCILLARY(QUEUE); BPF_ANCILLARY(HATYPE); BPF_ANCILLARY(RXHASH); BPF_ANCILLARY(CPU); BPF_ANCILLARY(ALU_XOR_X); BPF_ANCILLARY(VLAN_TAG); BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); BPF_ANCILLARY(VLAN_TPID); } fallthrough; default: return ftest->code; } } void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size); static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } struct bpf_sock_addr_kern { struct sock *sk; struct sockaddr *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; void *t_ctx; /* Attach type specific context. */ u32 uaddrlen; }; struct bpf_sock_ops_kern { struct sock *sk; union { u32 args[4]; u32 reply; u32 replylong[4]; }; struct sk_buff *syn_skb; struct sk_buff *skb; void *skb_data_end; u8 op; u8 is_fullsock; u8 is_locked_tcp_sock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that * should be initialized to 0 should * be inserted before temp. * temp is scratch storage used by * sock_ops_convert_ctx_access * as temporary storage of a register. */ }; struct bpf_sysctl_kern { struct ctl_table_header *head; const struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; size_t new_len; int new_updated; int write; loff_t *ppos; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; #define BPF_SOCKOPT_KERN_BUF_SIZE 32 struct bpf_sockopt_buf { u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; }; struct bpf_sockopt_kern { struct sock *sk; u8 *optval; u8 *optval_end; s32 level; s32 optname; s32 optlen; /* for retval in struct bpf_cg_run_ctx */ struct task_struct *current_task; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; __be16 sport; u16 dport; struct { __be32 saddr; __be32 daddr; } v4; struct { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; struct sock *selected_sk; u32 ingress_ifindex; bool no_reuseport; }; extern struct static_key_false bpf_sk_lookup_enabled; /* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup. * * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and * SK_DROP. Their meaning is as follows: * * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup * SK_DROP : terminate lookup with -ECONNREFUSED * * This macro aggregates return values and selected sockets from * multiple BPF programs according to following rules in order: * * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk, * macro result is SK_PASS and last ctx.selected_sk is used. * 2. If any program returned SK_DROP return value, * macro result is SK_DROP. * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL. * * Caller must ensure that the prog array is non-NULL, and that the * array as well as the programs it contains remain valid. */ #define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \ ({ \ struct bpf_sk_lookup_kern *_ctx = &(ctx); \ struct bpf_prog_array_item *_item; \ struct sock *_selected_sk = NULL; \ bool _no_reuseport = false; \ struct bpf_prog *_prog; \ bool _all_pass = true; \ u32 _ret; \ \ migrate_disable(); \ _item = &(array)->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ /* restore most recent selection */ \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ \ _ret = func(_prog, _ctx); \ if (_ret == SK_PASS && _ctx->selected_sk) { \ /* remember last non-NULL socket */ \ _selected_sk = _ctx->selected_sk; \ _no_reuseport = _ctx->no_reuseport; \ } else if (_ret == SK_DROP && _all_pass) { \ _all_pass = false; \ } \ _item++; \ } \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ migrate_enable(); \ _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ }) static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET, .protocol = protocol, .v4.saddr = saddr, .v4.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #if IS_ENABLED(CONFIG_IPV6) static inline bool bpf_sk_lookup_run_v6(const struct net *net, int protocol, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET6, .protocol = protocol, .v6.saddr = saddr, .v6.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #endif /* IS_ENABLED(CONFIG_IPV6) */ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes * precedence. */ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags & action_mask; } ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; if (flags & BPF_F_BROADCAST) { WRITE_ONCE(ri->map, map); ri->flags = flags; } else { WRITE_ONCE(ri->map, NULL); ri->flags = 0; } return XDP_REDIRECT; } #ifdef CONFIG_NET int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return -EOPNOTSUPP; } static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { return NULL; } static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { } #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */
203 21 1 20 49 41 19 38 38 32 28 38 38 9 2 2 2 2 13 10 1 2 34 190 190 190 113 76 17 39 39 39 5 5 1 3 1 34 2 4 11 18 35 33 2 33 2 33 2 34 1 24 3 21 18 6 14 10 7 17 5 19 109 71 25 9 8 43 10 11 25 179 180 181 181 106 96 14 192 1 84 36 1 3 76 4 9 1 27 28 1 11 4 1 12 10 9 9 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/inode.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Inode handling routines */ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/uio.h> #include <linux/fileattr.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" static int hfsplus_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, hfsplus_get_block); } static void hfsplus_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); hfsplus_file_truncate(inode); } } int hfsplus_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfsplus_get_block, &HFSPLUS_I(mapping->host)->phys_size); if (unlikely(ret)) hfsplus_write_failed(mapping, pos + len); return ret; } static sector_t hfsplus_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, hfsplus_get_block); } static bool hfsplus_release_folio(struct folio *folio, gfp_t mask) { struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct hfs_btree *tree; struct hfs_bnode *node; u32 nidx; int i; bool res = true; switch (inode->i_ino) { case HFSPLUS_EXT_CNID: tree = HFSPLUS_SB(sb)->ext_tree; break; case HFSPLUS_CAT_CNID: tree = HFSPLUS_SB(sb)->cat_tree; break; case HFSPLUS_ATTR_CNID: tree = HFSPLUS_SB(sb)->attr_tree; break; default: BUG(); return false; } if (!tree) return false; if (tree->node_size >= PAGE_SIZE) { nidx = folio->index >> (tree->node_size_shift - PAGE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); if (!node) ; else if (atomic_read(&node->refcnt)) res = false; if (res && node) { hfs_bnode_unhash(node); hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { nidx = folio->index << (PAGE_SHIFT - tree->node_size_shift); i = 1 << (PAGE_SHIFT - tree->node_size_shift); spin_lock(&tree->hash_lock); do { node = hfs_bnode_findhash(tree, nidx++); if (!node) continue; if (atomic_read(&node->refcnt)) { res = false; break; } hfs_bnode_unhash(node); hfs_bnode_free(node); } while (--i && nidx < tree->node_count); spin_unlock(&tree->hash_lock); } return res ? try_to_free_buffers(folio) : false; } static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; ret = blockdev_direct_IO(iocb, inode, iter, hfsplus_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = iocb->ki_pos + count; if (end > isize) hfsplus_write_failed(mapping, end); } return ret; } static int hfsplus_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, hfsplus_get_block); } const struct address_space_operations hfsplus_btree_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, .writepages = hfsplus_writepages, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .migrate_folio = buffer_migrate_folio, .bmap = hfsplus_bmap, .release_folio = hfsplus_release_folio, }; const struct address_space_operations hfsplus_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .bmap = hfsplus_bmap, .direct_IO = hfsplus_direct_IO, .writepages = hfsplus_writepages, .migrate_folio = buffer_migrate_folio, }; const struct dentry_operations hfsplus_dentry_operations = { .d_hash = hfsplus_hash_dentry, .d_compare = hfsplus_compare_dentry, }; static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); u16 mode; mode = be16_to_cpu(perms->mode); i_uid_write(inode, be32_to_cpu(perms->owner)); if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode)) inode->i_uid = sbi->uid; i_gid_write(inode, be32_to_cpu(perms->group)); if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode)) inode->i_gid = sbi->gid; if (dir) { mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask)); mode |= S_IFDIR; } else if (!mode) mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask)); inode->i_mode = mode; HFSPLUS_I(inode)->userflags = perms->userflags; if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; if (perms->rootflags & HFSPLUS_FLG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; } static int hfsplus_file_open(struct inode *inode, struct file *file) { if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EOVERFLOW; atomic_inc(&HFSPLUS_I(inode)->opencnt); return 0; } static int hfsplus_file_release(struct inode *inode, struct file *file) { struct super_block *sb = inode->i_sb; if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } inode_unlock(inode); } return 0; } static int hfsplus_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) return error; if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); if (attr->ia_size > inode->i_size) { error = generic_cont_expand_simple(inode, attr->ia_size); if (error) return error; } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = hfsp_mt2ut(hip->create_date); } if (inode->i_flags & S_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (inode->i_flags & S_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (hip->userflags & HFSPLUS_FLG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct hfsplus_inode_info *hip = HFSPLUS_I(inode); struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); int error = 0, error2; error = file_write_and_wait_range(file, start, end); if (error) return error; inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. */ sync_inode_metadata(inode, 1); /* * And explicitly write out the btrees. */ if (test_and_clear_bit(HFSPLUS_I_CAT_DIRTY, &hip->flags)) error = filemap_write_and_wait(sbi->cat_tree->inode->i_mapping); if (test_and_clear_bit(HFSPLUS_I_EXT_DIRTY, &hip->flags)) { error2 = filemap_write_and_wait(sbi->ext_tree->inode->i_mapping); if (!error) error = error2; } if (test_and_clear_bit(HFSPLUS_I_ATTR_DIRTY, &hip->flags)) { if (sbi->attr_tree) { error2 = filemap_write_and_wait( sbi->attr_tree->inode->i_mapping); if (!error) error = error2; } else { pr_err("sync non-existent attributes tree\n"); } } if (test_and_clear_bit(HFSPLUS_I_ALLOC_DIRTY, &hip->flags)) { error2 = filemap_write_and_wait(sbi->alloc_file->i_mapping); if (!error) error = error2; } if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev); inode_unlock(inode); return error; } static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, .getattr = hfsplus_getattr, .listxattr = hfsplus_listxattr, .fileattr_get = hfsplus_fileattr_get, .fileattr_set = hfsplus_fileattr_set, }; static const struct file_operations hfsplus_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = filemap_splice_read, .fsync = hfsplus_file_fsync, .open = hfsplus_file_open, .release = hfsplus_file_release, .unlocked_ioctl = hfsplus_ioctl, }; struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, umode_t mode) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *inode = new_inode(sb); struct hfsplus_inode_info *hip; if (!inode) return NULL; inode->i_ino = sbi->next_cnid++; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); simple_inode_init_ts(inode); hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); spin_lock_init(&hip->open_dir_lock); mutex_init(&hip->extents_lock); atomic_set(&hip->opencnt, 0); hip->extent_state = 0; hip->flags = 0; hip->userflags = 0; hip->subfolders = 0; memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec)); memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->alloc_blocks = 0; hip->first_blocks = 0; hip->cached_start = 0; hip->cached_blocks = 0; hip->phys_size = 0; hip->fs_blocks = 0; hip->rsrc_inode = NULL; if (S_ISDIR(inode->i_mode)) { inode->i_size = 2; sbi->folder_count++; inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (S_ISREG(inode->i_mode)) { sbi->file_count++; inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; hip->clump_blocks = sbi->data_clump_blocks; } else if (S_ISLNK(inode->i_mode)) { sbi->file_count++; inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; hip->clump_blocks = 1; } else sbi->file_count++; insert_inode_hash(inode); mark_inode_dirty(inode); hfsplus_mark_mdb_dirty(sb); return inode; } void hfsplus_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; if (S_ISDIR(inode->i_mode)) { HFSPLUS_SB(sb)->folder_count--; hfsplus_mark_mdb_dirty(sb); return; } HFSPLUS_SB(sb)->file_count--; if (S_ISREG(inode->i_mode)) { if (!inode->i_nlink) { inode->i_size = 0; hfsplus_file_truncate(inode); } } else if (S_ISLNK(inode->i_mode)) { inode->i_size = 0; hfsplus_file_truncate(inode); } hfsplus_mark_mdb_dirty(sb); } void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { struct super_block *sb = inode->i_sb; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); u32 count; int i; memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec)); for (count = 0, i = 0; i < 8; i++) count += be32_to_cpu(fork->extents[i].block_count); hip->first_blocks = count; memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec)); hip->cached_start = 0; hip->cached_blocks = 0; hip->alloc_blocks = be32_to_cpu(fork->total_blocks); hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size); hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits); hip->clump_blocks = be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift; if (!hip->clump_blocks) { hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ? sbi->rsrc_clump_blocks : sbi->data_clump_blocks; } } void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) { memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents, sizeof(hfsplus_extent_rec)); fork->total_size = cpu_to_be64(inode->i_size); fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks); } int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) { hfsplus_cat_entry entry; int res = 0; u16 type; type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); HFSPLUS_I(inode)->linkid = 0; if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) { pr_err("bad catalog folder entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date)); inode_set_mtime_to_ts(inode, hfsp_mt2ut(folder->content_mod_date)); inode_set_ctime_to_ts(inode, hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { HFSPLUS_I(inode)->subfolders = be32_to_cpu(folder->subfolders); } inode->i_op = &hfsplus_dir_inode_operations; inode->i_fop = &hfsplus_dir_operations; } else if (type == HFSPLUS_FILE) { struct hfsplus_cat_file *file = &entry.file; if (fd->entrylength < sizeof(struct hfsplus_cat_file)) { pr_err("bad catalog file entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_read_fork(inode, HFSPLUS_IS_RSRC(inode) ? &file->rsrc_fork : &file->data_fork); hfsplus_get_perms(inode, &file->permissions, 0); set_nlink(inode, 1); if (S_ISREG(inode->i_mode)) { if (file->permissions.dev) set_nlink(inode, be32_to_cpu(file->permissions.dev)); inode->i_op = &hfsplus_file_inode_operations; inode->i_fop = &hfsplus_file_operations; inode->i_mapping->a_ops = &hfsplus_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; } else { init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date)); inode_set_mtime_to_ts(inode, hfsp_mt2ut(file->content_mod_date)); inode_set_ctime_to_ts(inode, hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); res = -EIO; } out: return res; } int hfsplus_cat_write_inode(struct inode *inode) { struct inode *main_inode = inode; struct hfs_find_data fd; hfsplus_cat_entry entry; int res = 0; if (HFSPLUS_IS_RSRC(inode)) main_inode = HFSPLUS_I(inode)->rsrc_inode; if (!main_inode->i_nlink) return 0; if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd)) /* panic? */ return -EIO; if (hfsplus_find_cat(main_inode->i_sb, main_inode->i_ino, &fd)) /* panic? */ goto out; if (S_ISDIR(main_inode->i_mode)) { struct hfsplus_cat_folder *folder = &entry.folder; if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) { pr_err("bad catalog folder entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ hfsplus_cat_set_perms(inode, &folder->permissions); folder->access_date = hfsp_ut2mt(inode_get_atime(inode)); folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); folder->valence = cpu_to_be32(inode->i_size - 2); if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { folder->subfolders = cpu_to_be32(HFSPLUS_I(inode)->subfolders); } hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); } else if (HFSPLUS_IS_RSRC(inode)) { struct hfsplus_cat_file *file = &entry.file; hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->rsrc_fork); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } else { struct hfsplus_cat_file *file = &entry.file; if (fd.entrylength < sizeof(struct hfsplus_cat_file)) { pr_err("bad catalog file entry\n"); res = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); hfsplus_cat_set_perms(inode, &file->permissions); if (HFSPLUS_FLG_IMMUTABLE & (file->permissions.rootflags | file->permissions.userflags)) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); else file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); file->access_date = hfsp_ut2mt(inode_get_atime(inode)); file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags); out: hfs_find_exit(&fd); return res; } int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int flags = 0; if (inode->i_flags & S_IMMUTABLE) flags |= FS_IMMUTABLE_FL; if (inode->i_flags & S_APPEND) flags |= FS_APPEND_FL; if (hip->userflags & HFSPLUS_FLG_NODUMP) flags |= FS_NODUMP_FL; fileattr_fill_flags(fa, flags); return 0; } int hfsplus_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); unsigned int new_fl = 0; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; /* don't silently ignore unsupported ext2 flags */ if (fa->flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) return -EOPNOTSUPP; if (fa->flags & FS_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (fa->flags & FS_APPEND_FL) new_fl |= S_APPEND; inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); if (fa->flags & FS_NODUMP_FL) hip->userflags |= HFSPLUS_FLG_NODUMP; else hip->userflags &= ~HFSPLUS_FLG_NODUMP; inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 // SPDX-License-Identifier: GPL-2.0-only /* * File: pn_dev.c * * Phonet network device * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/phonet.h> #include <linux/proc_fs.h> #include <linux/if_arp.h> #include <net/sock.h> #include <net/netns/generic.h> #include <net/phonet/pn_dev.h> struct phonet_routes { spinlock_t lock; struct net_device __rcu *table[64]; }; struct phonet_net { struct phonet_device_list pndevs; struct phonet_routes routes; }; static unsigned int phonet_net_id __read_mostly; static struct phonet_net *phonet_pernet(struct net *net) { return net_generic(net, phonet_net_id); } struct phonet_device_list *phonet_device_list(struct net *net) { struct phonet_net *pnn = phonet_pernet(net); return &pnn->pndevs; } /* Allocate new Phonet device. */ static struct phonet_device *__phonet_device_alloc(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd = kmalloc(sizeof(*pnd), GFP_ATOMIC); if (pnd == NULL) return NULL; pnd->netdev = dev; bitmap_zero(pnd->addrs, 64); lockdep_assert_held(&pndevs->lock); list_add_rcu(&pnd->list, &pndevs->list); return pnd; } static struct phonet_device *__phonet_get(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; lockdep_assert_held(&pndevs->lock); list_for_each_entry(pnd, &pndevs->list, list) { if (pnd->netdev == dev) return pnd; } return NULL; } static struct phonet_device *__phonet_get_rcu(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; list_for_each_entry_rcu(pnd, &pndevs->list, list) { if (pnd->netdev == dev) return pnd; } return NULL; } static void phonet_device_destroy(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; ASSERT_RTNL(); spin_lock(&pndevs->lock); pnd = __phonet_get(dev); if (pnd) list_del_rcu(&pnd->list); spin_unlock(&pndevs->lock); if (pnd) { struct net *net = dev_net(dev); u32 ifindex = dev->ifindex; u8 addr; for_each_set_bit(addr, pnd->addrs, 64) phonet_address_notify(net, RTM_DELADDR, ifindex, addr); kfree(pnd); } } struct net_device *phonet_device_get(struct net *net) { struct phonet_device_list *pndevs = phonet_device_list(net); struct phonet_device *pnd; struct net_device *dev = NULL; rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { dev = pnd->netdev; BUG_ON(!dev); if ((dev->reg_state == NETREG_REGISTERED) && ((pnd->netdev->flags & IFF_UP)) == IFF_UP) break; dev = NULL; } dev_hold(dev); rcu_read_unlock(); return dev; } int phonet_address_add(struct net_device *dev, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; int err = 0; spin_lock(&pndevs->lock); /* Find or create Phonet-specific device data */ pnd = __phonet_get(dev); if (pnd == NULL) pnd = __phonet_device_alloc(dev); if (unlikely(pnd == NULL)) err = -ENOMEM; else if (test_and_set_bit(addr >> 2, pnd->addrs)) err = -EEXIST; spin_unlock(&pndevs->lock); return err; } int phonet_address_del(struct net_device *dev, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; int err = 0; spin_lock(&pndevs->lock); pnd = __phonet_get(dev); if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs)) { err = -EADDRNOTAVAIL; pnd = NULL; } else if (bitmap_empty(pnd->addrs, 64)) list_del_rcu(&pnd->list); else pnd = NULL; spin_unlock(&pndevs->lock); if (pnd) kfree_rcu(pnd, rcu); return err; } /* Gets a source address toward a destination, through a interface. */ u8 phonet_address_get(struct net_device *dev, u8 daddr) { struct phonet_device *pnd; u8 saddr; rcu_read_lock(); pnd = __phonet_get_rcu(dev); if (pnd) { BUG_ON(bitmap_empty(pnd->addrs, 64)); /* Use same source address as destination, if possible */ if (test_bit(daddr >> 2, pnd->addrs)) saddr = daddr; else saddr = find_first_bit(pnd->addrs, 64) << 2; } else saddr = PN_NO_ADDR; rcu_read_unlock(); if (saddr == PN_NO_ADDR) { /* Fallback to another device */ struct net_device *def_dev; def_dev = phonet_device_get(dev_net(dev)); if (def_dev) { if (def_dev != dev) saddr = phonet_address_get(def_dev, daddr); dev_put(def_dev); } } return saddr; } int phonet_address_lookup(struct net *net, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(net); struct phonet_device *pnd; int err = -EADDRNOTAVAIL; rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { /* Don't allow unregistering devices! */ if ((pnd->netdev->reg_state != NETREG_REGISTERED) || ((pnd->netdev->flags & IFF_UP)) != IFF_UP) continue; if (test_bit(addr >> 2, pnd->addrs)) { err = 0; goto found; } } found: rcu_read_unlock(); return err; } /* automatically configure a Phonet device, if supported */ static int phonet_device_autoconf(struct net_device *dev) { struct if_phonet_req req; int ret; if (!dev->netdev_ops->ndo_siocdevprivate) return -EOPNOTSUPP; ret = dev->netdev_ops->ndo_siocdevprivate(dev, (struct ifreq *)&req, NULL, SIOCPNGAUTOCONF); if (ret < 0) return ret; ASSERT_RTNL(); ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device); if (ret) return ret; phonet_address_notify(dev_net(dev), RTM_NEWADDR, dev->ifindex, req.ifr_phonet_autoconf.device); return 0; } static void phonet_route_autodel(struct net_device *dev) { struct net *net = dev_net(dev); DECLARE_BITMAP(deleted, 64); u32 ifindex = dev->ifindex; struct phonet_net *pnn; unsigned int i; pnn = phonet_pernet(net); /* Remove left-over Phonet routes */ bitmap_zero(deleted, 64); spin_lock(&pnn->routes.lock); for (i = 0; i < 64; i++) { if (rcu_access_pointer(pnn->routes.table[i]) == dev) { RCU_INIT_POINTER(pnn->routes.table[i], NULL); set_bit(i, deleted); } } spin_unlock(&pnn->routes.lock); if (bitmap_empty(deleted, 64)) return; /* short-circuit RCU */ synchronize_rcu(); for_each_set_bit(i, deleted, 64) { rtm_phonet_notify(net, RTM_DELROUTE, ifindex, i); dev_put(dev); } } /* notify Phonet of device events */ static int phonet_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (what) { case NETDEV_REGISTER: if (dev->type == ARPHRD_PHONET) phonet_device_autoconf(dev); break; case NETDEV_UNREGISTER: phonet_device_destroy(dev); phonet_route_autodel(dev); break; } return 0; } static struct notifier_block phonet_device_notifier = { .notifier_call = phonet_device_notify, .priority = 0, }; /* Per-namespace Phonet devices handling */ static int __net_init phonet_init_net(struct net *net) { struct phonet_net *pnn = phonet_pernet(net); if (!proc_create_net("phonet", 0, net->proc_net, &pn_sock_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; INIT_LIST_HEAD(&pnn->pndevs.list); spin_lock_init(&pnn->pndevs.lock); spin_lock_init(&pnn->routes.lock); return 0; } static void __net_exit phonet_exit_net(struct net *net) { struct phonet_net *pnn = phonet_pernet(net); remove_proc_entry("phonet", net->proc_net); WARN_ON_ONCE(!list_empty(&pnn->pndevs.list)); } static struct pernet_operations phonet_net_ops = { .init = phonet_init_net, .exit = phonet_exit_net, .id = &phonet_net_id, .size = sizeof(struct phonet_net), }; /* Initialize Phonet devices list */ int __init phonet_device_init(void) { int err = register_pernet_subsys(&phonet_net_ops); if (err) return err; proc_create_net("pnresource", 0, init_net.proc_net, &pn_res_seq_ops, sizeof(struct seq_net_private)); register_netdevice_notifier(&phonet_device_notifier); err = phonet_netlink_register(); if (err) phonet_device_exit(); return err; } void phonet_device_exit(void) { rtnl_unregister_all(PF_PHONET); unregister_netdevice_notifier(&phonet_device_notifier); unregister_pernet_subsys(&phonet_net_ops); remove_proc_entry("pnresource", init_net.proc_net); } int phonet_route_add(struct net_device *dev, u8 daddr) { struct phonet_net *pnn = phonet_pernet(dev_net(dev)); struct phonet_routes *routes = &pnn->routes; int err = -EEXIST; daddr = daddr >> 2; spin_lock(&routes->lock); if (routes->table[daddr] == NULL) { rcu_assign_pointer(routes->table[daddr], dev); dev_hold(dev); err = 0; } spin_unlock(&routes->lock); return err; } int phonet_route_del(struct net_device *dev, u8 daddr) { struct phonet_net *pnn = phonet_pernet(dev_net(dev)); struct phonet_routes *routes = &pnn->routes; daddr = daddr >> 2; spin_lock(&routes->lock); if (rcu_access_pointer(routes->table[daddr]) == dev) RCU_INIT_POINTER(routes->table[daddr], NULL); else dev = NULL; spin_unlock(&routes->lock); if (!dev) return -ENOENT; /* Note : our caller must call synchronize_rcu() and dev_put(dev) */ return 0; } struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr) { struct phonet_net *pnn = phonet_pernet(net); struct phonet_routes *routes = &pnn->routes; struct net_device *dev; daddr >>= 2; dev = rcu_dereference(routes->table[daddr]); return dev; } struct net_device *phonet_route_output(struct net *net, u8 daddr) { struct phonet_net *pnn = phonet_pernet(net); struct phonet_routes *routes = &pnn->routes; struct net_device *dev; daddr >>= 2; rcu_read_lock(); dev = rcu_dereference(routes->table[daddr]); dev_hold(dev); rcu_read_unlock(); if (!dev) dev = phonet_device_get(net); /* Default route */ return dev; }
151 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); void ksm_add_vma(struct vm_area_struct *vma); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); /* * To identify zeropages that were mapped by KSM, we reuse the dirty bit * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when * deduplicating memory. */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) extern atomic_long_t ksm_zero_pages; static inline void ksm_map_zero_page(struct mm_struct *mm) { atomic_long_inc(&ksm_zero_pages); atomic_long_inc(&mm->ksm_zero_pages); } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { atomic_long_dec(&ksm_zero_pages); atomic_long_dec(&mm->ksm_zero_pages); } } static inline long mm_ksm_zero_pages(struct mm_struct *mm) { return atomic_long_read(&mm->ksm_zero_pages); } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return __ksm_enter(mm); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ static inline void ksm_add_vma(struct vm_area_struct *vma) { } static inline int ksm_disable(struct mm_struct *mm) { return 0; } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { } static inline int ksm_execve(struct mm_struct *mm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } static inline void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { } #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { return 0; } static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { return folio; } static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */
434 272 10 294 440 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H #include <linux/sizes.h> #include <linux/compiler_types.h> #include "ctree.h" #include "fs.h" struct block_device; struct super_block; struct extent_buffer; struct btrfs_device; struct btrfs_fs_devices; struct btrfs_fs_info; struct btrfs_super_block; struct btrfs_trans_handle; struct btrfs_tree_parent_check; struct btrfs_transaction; #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 /* * Fixed blocksize for all devices, applies to specific ways of reading * metadata like superblock. Must meet the set_blocksize requirements. * * Do not change. */ #define BTRFS_BDEV_BLOCKSIZE (4096) static inline u64 btrfs_sb_offset(int mirror) { u64 start = SZ_16K; if (mirror) return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); return BTRFS_SUPER_INFO_OFFSET; } void btrfs_check_leaked_roots(const struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices); void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, dev_t *anon_dev); struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 objectid); int btrfs_global_root_insert(struct btrfs_root *root); void btrfs_global_root_delete(struct btrfs_root *root); struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); int btrfs_validate_extent_buffer(struct extent_buffer *eb, const struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif /* * This function is used to grab the root, and avoid it is freed when we * access it. But it doesn't ensure that the tree is not dropped. */ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) { if (!root) return NULL; if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); #endif
127 103 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_CLOCK_H #define _BCACHEFS_CLOCK_H void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); void __bch2_increment_clock(struct io_clock *, u64); static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, int rw) { struct io_clock *clock = &c->io_clock[rw]; if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= IO_CLOCK_PCPU_SECTORS)) __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); } void bch2_io_clock_schedule_timeout(struct io_clock *, u64); void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); void bch2_io_clock_exit(struct io_clock *); int bch2_io_clock_init(struct io_clock *); #endif /* _BCACHEFS_CLOCK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H #include <linux/poison.h> #include <linux/const.h> /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. * (up to 2^31 different values guaranteed on all platforms) * * In the standard hlist, termination of a list is the NULL pointer. * In this special 'nulls' variant, we use the fact that objects stored in * a list are aligned on a word (4 or 8 bytes alignment). * We therefore use the last significant bit of 'ptr' : * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) * Set to 0 : This is a pointer to some object (ptr) */ struct hlist_nulls_head { struct hlist_nulls_node *first; }; struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_nulls_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ }) /** * ptr_is_a_nulls - Test if a ptr is a nulls * @ptr: ptr to be tested * */ static inline int is_a_nulls(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr & 1); } /** * get_nulls_value - Get the 'nulls' value of the end of chain * @ptr: end of chain * * Should be called only if is_a_nulls(ptr); */ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr) >> 1; } /** * hlist_nulls_unhashed - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. */ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) { return !h->pprev; } /** * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this * function may be used locklessly. */ static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) { return !READ_ONCE(h->pprev); } static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); h->first = n; if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; struct hlist_nulls_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) WRITE_ONCE(next->pprev, pprev); } static inline void hlist_nulls_del(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry_from(tpos, pos, member) \ for (; (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) #endif
8 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_NAMEI_H #define _BCACHEFS_NAMEI_H #include "dirent.h" struct posix_acl; #define BCH_CREATE_TMPFILE (1U << 0) #define BCH_CREATE_SUBVOL (1U << 1) #define BCH_CREATE_SNAPSHOT (1U << 2) #define BCH_CREATE_SNAPSHOT_RO (1U << 3) int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, struct posix_acl *, subvol_inum, unsigned); int bch2_link_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, subvol_inum, struct bch_inode_unpacked *, const struct qstr *); int bch2_unlink_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, bool); int bch2_rename_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, const struct qstr *, enum bch_rename_mode); bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); int __bch2_check_dirent_target(struct btree_trans *, struct btree_iter *, struct bkey_s_c_dirent, struct bch_inode_unpacked *, bool); static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, struct bkey_s_c_dirent d) { return inode->bi_dir == d.k->p.inode && inode->bi_dir_offset == d.k->p.offset; } static inline int bch2_check_dirent_target(struct btree_trans *trans, struct btree_iter *dirent_iter, struct bkey_s_c_dirent d, struct bch_inode_unpacked *target, bool in_fsck) { if (likely(inode_points_to_dirent(target, d) && d.v->d_type == inode_d_type(target))) return 0; return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); } #endif /* _BCACHEFS_NAMEI_H */
4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 4 4 3 3 4 4 4 2 3 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 // SPDX-License-Identifier: GPL-2.0 /* * List pending timers * * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar */ #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/nmi.h> #include <linux/uaccess.h> #include "tick-internal.h" struct timer_list_iter { int cpu; bool second_pass; u64 now; }; /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): */ __printf(2, 3) static void SEQ_printf(struct seq_file *m, const char *fmt, ...) { va_list args; va_start(args, fmt); if (m) seq_vprintf(m, fmt, args); else vprintk(fmt, args); va_end(args); } static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { struct hrtimer *timer, tmp; unsigned long next = 0, i; struct timerqueue_node *curr; unsigned long flags; next_one: i = 0; touch_nmi_watchdog(); raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = timerqueue_getnext(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { curr = timerqueue_iterate_next(curr); i++; } if (curr) { timer = container_of(curr, struct hrtimer, node); tmp = *timer; raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); print_timer(m, timer, &tmp, i, now); next++; goto next_one; } raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); } static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now + ktime_to_ns(base->offset)); } static void print_cpu(struct seq_file *m, int cpu, u64 now) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; SEQ_printf(m, "cpu: %d\n", cpu); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { SEQ_printf(m, " clock %d:\n", i); print_base(m, cpu_base->clock_base + i, now); } #define P(x) \ SEQ_printf(m, " .%-15s: %Lu\n", #x, \ (unsigned long long)(cpu_base->x)) #define P_ns(x) \ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ (unsigned long long)(ktime_to_ns(cpu_base->x))) #ifdef CONFIG_HIGH_RES_TIMERS P_ns(expires_next); P(hres_active); P(nr_events); P(nr_retries); P(nr_hangs); P(max_hang_time); #endif #undef P #undef P_ns #ifdef CONFIG_TICK_ONESHOT # define P(x) \ SEQ_printf(m, " .%-15s: %Lu\n", #x, \ (unsigned long long)(ts->x)) # define P_ns(x) \ SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ (unsigned long long)(ktime_to_ns(ts->x))) # define P_flag(x, f) \ SEQ_printf(m, " .%-15s: %d\n", #x, !!(ts->flags & (f))) { struct tick_sched *ts = tick_get_tick_sched(cpu); P_flag(nohz, TS_FLAG_NOHZ); P_flag(highres, TS_FLAG_HIGHRES); P_ns(last_tick); P_flag(tick_stopped, TS_FLAG_STOPPED); P(idle_jiffies); P(idle_calls); P(idle_sleeps); P_ns(idle_entrytime); P_ns(idle_waketime); P_ns(idle_exittime); P_ns(idle_sleeptime); P_ns(iowait_sleeptime); P(last_jiffies); P(next_timer); P_ns(idle_expires); SEQ_printf(m, "jiffies: %Lu\n", (unsigned long long)jiffies); } #endif #undef P #undef P_ns SEQ_printf(m, "\n"); } #ifdef CONFIG_GENERIC_CLOCKEVENTS static void print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; touch_nmi_watchdog(); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); else SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { SEQ_printf(m, "<NULL>\n"); return; } SEQ_printf(m, "%s\n", dev->name); SEQ_printf(m, " max_delta_ns: %llu\n", (unsigned long long) dev->max_delta_ns); SEQ_printf(m, " min_delta_ns: %llu\n", (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event); if (dev->set_state_shutdown) SEQ_printf(m, " shutdown: %ps\n", dev->set_state_shutdown); if (dev->set_state_periodic) SEQ_printf(m, " periodic: %ps\n", dev->set_state_periodic); if (dev->set_state_oneshot) SEQ_printf(m, " oneshot: %ps\n", dev->set_state_oneshot); if (dev->set_state_oneshot_stopped) SEQ_printf(m, " oneshot stopped: %ps\n", dev->set_state_oneshot_stopped); if (dev->tick_resume) SEQ_printf(m, " resume: %ps\n", dev->tick_resume); SEQ_printf(m, " event_handler: %ps\n", dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST if (cpu >= 0) { const struct clock_event_device *wd = tick_get_wakeup_device(cpu); SEQ_printf(m, "Wakeup Device: %s\n", wd ? wd->name : "<NULL>"); } #endif SEQ_printf(m, "\n"); } static void timer_list_show_tickdevices_header(struct seq_file *m) { #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %*pb\n", cpumask_pr_args(tick_get_broadcast_mask())); #ifdef CONFIG_TICK_ONESHOT SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", cpumask_pr_args(tick_get_broadcast_oneshot_mask())); #endif SEQ_printf(m, "\n"); #endif } #endif static inline void timer_list_header(struct seq_file *m, u64 now) { SEQ_printf(m, "Timer List Version: v0.10\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); } void sysrq_timer_list_show(void) { u64 now = ktime_to_ns(ktime_get()); int cpu; timer_list_header(NULL, now); for_each_online_cpu(cpu) print_cpu(NULL, cpu, now); #ifdef CONFIG_GENERIC_CLOCKEVENTS timer_list_show_tickdevices_header(NULL); for_each_online_cpu(cpu) print_tickdevice(NULL, tick_get_device(cpu), cpu); #endif return; } #ifdef CONFIG_PROC_FS static int timer_list_show(struct seq_file *m, void *v) { struct timer_list_iter *iter = v; if (iter->cpu == -1 && !iter->second_pass) timer_list_header(m, iter->now); else if (!iter->second_pass) print_cpu(m, iter->cpu, iter->now); #ifdef CONFIG_GENERIC_CLOCKEVENTS else if (iter->cpu == -1 && iter->second_pass) timer_list_show_tickdevices_header(m); else print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); #endif return 0; } static void *move_iter(struct timer_list_iter *iter, loff_t offset) { for (; offset; offset--) { iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); if (iter->cpu >= nr_cpu_ids) { #ifdef CONFIG_GENERIC_CLOCKEVENTS if (!iter->second_pass) { iter->cpu = -1; iter->second_pass = true; } else return NULL; #else return NULL; #endif } } return iter; } static void *timer_list_start(struct seq_file *file, loff_t *offset) { struct timer_list_iter *iter = file->private; if (!*offset) iter->now = ktime_to_ns(ktime_get()); iter->cpu = -1; iter->second_pass = false; return move_iter(iter, *offset); } static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) { struct timer_list_iter *iter = file->private; ++*offset; return move_iter(iter, 1); } static void timer_list_stop(struct seq_file *seq, void *v) { } static const struct seq_operations timer_list_sops = { .start = timer_list_start, .next = timer_list_next, .stop = timer_list_stop, .show = timer_list_show, }; static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; pe = proc_create_seq_private("timer_list", 0400, NULL, &timer_list_sops, sizeof(struct timer_list_iter), NULL); if (!pe) return -ENOMEM; return 0; } __initcall(init_timer_list_procfs); #endif
8 8 8 8 8 21 112 147 143 19 16 88 10 16 93 93 93 93 93 93 143 143 19 19 5 5 19 24 24 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_DISK_ACCOUNTING_H #define _BCACHEFS_DISK_ACCOUNTING_H #include "btree_update.h" #include "eytzinger.h" #include "sb-members.h" static inline void bch2_u64s_neg(u64 *v, unsigned nr) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; } static inline unsigned bch2_accounting_counters(const struct bkey *k) { return bkey_val_u64s(k) - offsetof(struct bch_accounting, d) / sizeof(u64); } static inline void bch2_accounting_neg(struct bkey_s_accounting a) { bch2_u64s_neg(a.v->d, bch2_accounting_counters(a.k)); } static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) { for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) if (a.v->d[i]) return false; return true; } static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, struct bkey_s_c_accounting src) { for (unsigned i = 0; i < min(bch2_accounting_counters(&dst->k), bch2_accounting_counters(src.k)); i++) dst->v.d[i] += src.v->d[i]; if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) dst->k.bversion = src.k->bversion; } static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, enum bch_data_type data_type, s64 sectors) { switch (data_type) { case BCH_DATA_btree: fs_usage->btree += sectors; break; case BCH_DATA_user: case BCH_DATA_parity: fs_usage->data += sectors; break; case BCH_DATA_cached: fs_usage->cached += sectors; break; default: break; } } static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) { BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ acc->_pad = p; #else memcpy_swab(acc, &p, sizeof(p)); #endif } static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) { struct bpos p; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ p = acc->_pad; #else memcpy_swab(&p, acc, sizeof(p)); #endif return p; } int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); #define disk_accounting_key_init(_k, _type, ...) \ do { \ memset(&(_k), 0, sizeof(_k)); \ (_k).type = BCH_DISK_ACCOUNTING_##_type; \ (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ } while (0) #define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ ({ \ struct disk_accounting_pos pos; \ disk_accounting_key_init(pos, __VA_ARGS__); \ bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ }) #define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \ bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); #define bch2_bkey_ops_accounting ((struct bkey_ops) { \ .key_validate = bch2_accounting_validate, \ .val_to_text = bch2_accounting_to_text, \ .swab = bch2_accounting_swab, \ .min_val_size = 8, \ }) int bch2_accounting_update_sb(struct btree_trans *); static inline int accounting_pos_cmp(const void *_l, const void *_r) { const struct bpos *l = _l, *r = _r; return bpos_cmp(*l, *r); } enum bch_accounting_mode { BCH_ACCOUNTING_normal, BCH_ACCOUNTING_gc, BCH_ACCOUNTING_read, }; int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) { return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && acc.type != BCH_DISK_ACCOUNTING_inum; } /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path */ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { struct bch_fs *c = trans->c; struct bch_accounting_mem *acc = &c->accounting; struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, a.k->p); bool gc = mode == BCH_ACCOUNTING_gc; if (gc && !acc->gc_running) return 0; if (!bch2_accounting_is_mem(acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; break; case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); } rcu_read_unlock(); break; } } unsigned idx; while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { int ret = bch2_accounting_mem_insert(c, a, mode); if (ret) return ret; } struct accounting_mem_entry *e = &acc->k.data[idx]; EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) this_cpu_add(e->v[gc][i], a.v->d[i]); return 0; } static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); percpu_up_read(&trans->c->mark_lock); return ret; } static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, unsigned idx, u64 *v, unsigned nr, bool gc) { memset(v, 0, sizeof(*v) * nr); if (unlikely(idx >= acc->k.nr)) return; struct accounting_mem_entry *e = &acc->k.data[idx]; nr = min_t(unsigned, nr, e->nr_counters); for (unsigned i = 0; i < nr; i++) v[i] = percpu_u64_get(e->v[gc] + i); } static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { percpu_down_read(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); bch2_accounting_mem_read_counters(acc, idx, v, nr, false); percpu_up_read(&c->mark_lock); } static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) { EBUG_ON(!res->ref); return (struct bversion) { .hi = res->seq >> 32, .lo = (res->seq << 32) | (res->offset + offset), }; } static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, struct bkey_i_accounting *a, unsigned commit_flags) { a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - (u64 *) trans->journal_entries); EBUG_ON(bversion_zero(a->k.bversion)); return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) : 0; } static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, struct bkey_i_accounting *a_i, unsigned commit_flags) { if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { struct bkey_s_accounting a = accounting_i_to_s(a_i); bch2_accounting_neg(a); bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); bch2_accounting_neg(a); } } int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); int bch2_gc_accounting_start(struct bch_fs *); int bch2_gc_accounting_done(struct bch_fs *); int bch2_accounting_read(struct bch_fs *); int bch2_dev_usage_remove(struct bch_fs *, unsigned); int bch2_dev_usage_init(struct bch_dev *, bool); void bch2_verify_accounting_clean(struct bch_fs *c); void bch2_accounting_gc_free(struct bch_fs *); void bch2_fs_accounting_exit(struct bch_fs *); #endif /* _BCACHEFS_DISK_ACCOUNTING_H */
18 1 17 15 9 13 2 10 14 10 2 9 1 10 11 2 11 2 3 1 2 9 1 13 11 1 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 // SPDX-License-Identifier: GPL-2.0-only /* * LZO1X Decompressor from LZO * * Copyright (C) 1996-2012 Markus F.X.J. Oberhumer <markus@oberhumer.com> * * The full LZO package can be found at: * http://www.oberhumer.com/opensource/lzo/ * * Changed for Linux kernel use by: * Nitin Gupta <nitingupta910@gmail.com> * Richard Purdie <rpurdie@openedhand.com> */ #ifndef STATIC #include <linux/module.h> #include <linux/kernel.h> #endif #include <linux/unaligned.h> #include <linux/lzo.h> #include "lzodefs.h" #define HAVE_IP(x) ((size_t)(ip_end - ip) >= (size_t)(x)) #define HAVE_OP(x) ((size_t)(op_end - op) >= (size_t)(x)) #define NEED_IP(x) if (!HAVE_IP(x)) goto input_overrun #define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun #define TEST_LB(m_pos) if ((m_pos) < out) goto lookbehind_overrun /* This MAX_255_COUNT is the maximum number of times we can add 255 to a base * count without overflowing an integer. The multiply will overflow when * multiplying 255 by more than MAXINT/255. The sum will overflow earlier * depending on the base count. Since the base count is taken from a u8 * and a few bits, it is safe to assume that it will always be lower than * or equal to 2*255, thus we can always prevent any overflow by accepting * two less 255 steps. See Documentation/staging/lzo.rst for more information. */ #define MAX_255_COUNT ((((size_t)~0) / 255) - 2) int lzo1x_decompress_safe(const unsigned char *in, size_t in_len, unsigned char *out, size_t *out_len) { unsigned char *op; const unsigned char *ip; size_t t, next; size_t state = 0; const unsigned char *m_pos; const unsigned char * const ip_end = in + in_len; unsigned char * const op_end = out + *out_len; unsigned char bitstream_version; op = out; ip = in; if (unlikely(in_len < 3)) goto input_overrun; if (likely(in_len >= 5) && likely(*ip == 17)) { bitstream_version = ip[1]; ip += 2; } else { bitstream_version = 0; } if (*ip > 17) { t = *ip++ - 17; if (t < 4) { next = t; goto match_next; } goto copy_literal_run; } for (;;) { t = *ip++; if (t < 16) { if (likely(state == 0)) { if (unlikely(t == 0)) { size_t offset; const unsigned char *ip_last = ip; while (unlikely(*ip == 0)) { ip++; NEED_IP(1); } offset = ip - ip_last; if (unlikely(offset > MAX_255_COUNT)) return LZO_E_ERROR; offset = (offset << 8) - offset; t += offset + 15 + *ip++; } t += 3; copy_literal_run: #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) if (likely(HAVE_IP(t + 15) && HAVE_OP(t + 15))) { const unsigned char *ie = ip + t; unsigned char *oe = op + t; do { COPY8(op, ip); op += 8; ip += 8; COPY8(op, ip); op += 8; ip += 8; } while (ip < ie); ip = ie; op = oe; } else #endif { NEED_OP(t); NEED_IP(t + 3); do { *op++ = *ip++; } while (--t > 0); } state = 4; continue; } else if (state != 4) { next = t & 3; m_pos = op - 1; m_pos -= t >> 2; m_pos -= *ip++ << 2; TEST_LB(m_pos); NEED_OP(2); op[0] = m_pos[0]; op[1] = m_pos[1]; op += 2; goto match_next; } else { next = t & 3; m_pos = op - (1 + M2_MAX_OFFSET); m_pos -= t >> 2; m_pos -= *ip++ << 2; t = 3; } } else if (t >= 64) { next = t & 3; m_pos = op - 1; m_pos -= (t >> 2) & 7; m_pos -= *ip++ << 3; t = (t >> 5) - 1 + (3 - 1); } else if (t >= 32) { t = (t & 31) + (3 - 1); if (unlikely(t == 2)) { size_t offset; const unsigned char *ip_last = ip; while (unlikely(*ip == 0)) { ip++; NEED_IP(1); } offset = ip - ip_last; if (unlikely(offset > MAX_255_COUNT)) return LZO_E_ERROR; offset = (offset << 8) - offset; t += offset + 31 + *ip++; NEED_IP(2); } m_pos = op - 1; next = get_unaligned_le16(ip); ip += 2; m_pos -= next >> 2; next &= 3; } else { NEED_IP(2); next = get_unaligned_le16(ip); if (((next & 0xfffc) == 0xfffc) && ((t & 0xf8) == 0x18) && likely(bitstream_version)) { NEED_IP(3); t &= 7; t |= ip[2] << 3; t += MIN_ZERO_RUN_LENGTH; NEED_OP(t); memset(op, 0, t); op += t; next &= 3; ip += 3; goto match_next; } else { m_pos = op; m_pos -= (t & 8) << 11; t = (t & 7) + (3 - 1); if (unlikely(t == 2)) { size_t offset; const unsigned char *ip_last = ip; while (unlikely(*ip == 0)) { ip++; NEED_IP(1); } offset = ip - ip_last; if (unlikely(offset > MAX_255_COUNT)) return LZO_E_ERROR; offset = (offset << 8) - offset; t += offset + 7 + *ip++; NEED_IP(2); next = get_unaligned_le16(ip); } ip += 2; m_pos -= next >> 2; next &= 3; if (m_pos == op) goto eof_found; m_pos -= 0x4000; } } TEST_LB(m_pos); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) if (op - m_pos >= 8) { unsigned char *oe = op + t; if (likely(HAVE_OP(t + 15))) { do { COPY8(op, m_pos); op += 8; m_pos += 8; COPY8(op, m_pos); op += 8; m_pos += 8; } while (op < oe); op = oe; if (HAVE_IP(6)) { state = next; COPY4(op, ip); op += next; ip += next; continue; } } else { NEED_OP(t); do { *op++ = *m_pos++; } while (op < oe); } } else #endif { unsigned char *oe = op + t; NEED_OP(t); op[0] = m_pos[0]; op[1] = m_pos[1]; op += 2; m_pos += 2; do { *op++ = *m_pos++; } while (op < oe); } match_next: state = next; t = next; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) if (likely(HAVE_IP(6) && HAVE_OP(4))) { COPY4(op, ip); op += t; ip += t; } else #endif { NEED_IP(t + 3); NEED_OP(t); while (t > 0) { *op++ = *ip++; t--; } } } eof_found: *out_len = op - out; return (t != 3 ? LZO_E_ERROR : ip == ip_end ? LZO_E_OK : ip < ip_end ? LZO_E_INPUT_NOT_CONSUMED : LZO_E_INPUT_OVERRUN); input_overrun: *out_len = op - out; return LZO_E_INPUT_OVERRUN; output_overrun: *out_len = op - out; return LZO_E_OUTPUT_OVERRUN; lookbehind_overrun: *out_len = op - out; return LZO_E_LOOKBEHIND_OVERRUN; } #ifndef STATIC EXPORT_SYMBOL_GPL(lzo1x_decompress_safe); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LZO1X Decompressor"); #endif
128 175 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NLS_H #define _LINUX_NLS_H #include <linux/init.h> /* Unicode has changed over the years. Unicode code points no longer * fit into 16 bits; as of Unicode 5 valid code points range from 0 * to 0x10ffff (17 planes, where each plane holds 65536 code points). * * The original decision to represent Unicode characters as 16-bit * wchar_t values is now outdated. But plane 0 still includes the * most commonly used characters, so we will retain it. The newer * 32-bit unicode_t type can be used when it is necessary to * represent the full Unicode character set. */ /* Plane-0 Unicode character */ typedef u16 wchar_t; #define MAX_WCHAR_T 0xffff /* Arbitrary Unicode character */ typedef u32 unicode_t; struct nls_table { const char *charset; const char *alias; int (*uni2char) (wchar_t uni, unsigned char *out, int boundlen); int (*char2uni) (const unsigned char *rawstring, int boundlen, wchar_t *uni); const unsigned char *charset2lower; const unsigned char *charset2upper; struct module *owner; struct nls_table *next; }; /* this value hold the maximum octet of charset */ #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */ /* Byte order for UTF-16 strings */ enum utf16_endian { UTF16_HOST_ENDIAN, UTF16_LITTLE_ENDIAN, UTF16_BIG_ENDIAN }; /* nls_base.c */ extern int __register_nls(struct nls_table *, struct module *); extern int unregister_nls(struct nls_table *); extern struct nls_table *load_nls(const char *charset); extern void unload_nls(struct nls_table *); extern struct nls_table *load_nls_default(void); #define register_nls(nls) __register_nls((nls), THIS_MODULE) extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); extern int utf8s_to_utf16s(const u8 *s, int len, enum utf16_endian endian, wchar_t *pwcs, int maxlen); extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, u8 *s, int maxlen); static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) { unsigned char nc = t->charset2lower[c]; return nc ? nc : c; } static inline unsigned char nls_toupper(struct nls_table *t, unsigned char c) { unsigned char nc = t->charset2upper[c]; return nc ? nc : c; } static inline int nls_strnicmp(struct nls_table *t, const unsigned char *s1, const unsigned char *s2, int len) { while (len--) { if (nls_tolower(t, *s1++) != nls_tolower(t, *s2++)) return 1; } return 0; } /* * nls_nullsize - return length of null character for codepage * @codepage - codepage for which to return length of NULL terminator * * Since we can't guarantee that the null terminator will be a particular * length, we have to check against the codepage. If there's a problem * determining it, assume a single-byte NULL terminator. */ static inline int nls_nullsize(const struct nls_table *codepage) { int charlen; char tmp[NLS_MAX_CHARSET_SIZE]; charlen = codepage->uni2char(0, tmp, NLS_MAX_CHARSET_SIZE); return charlen > 0 ? charlen : 1; } #define MODULE_ALIAS_NLS(name) MODULE_ALIAS("nls_" __stringify(name)) #endif /* _LINUX_NLS_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * * Authors: Artem Bityutskiy (Битюцкий Артём), Thomas Gleixner */ /* * UBI wear-leveling sub-system. * * This sub-system is responsible for wear-leveling. It works in terms of * physical eraseblocks and erase counters and knows nothing about logical * eraseblocks, volumes, etc. From this sub-system's perspective all physical * eraseblocks are of two types - used and free. Used physical eraseblocks are * those that were "get" by the 'ubi_wl_get_peb()' function, and free physical * eraseblocks are those that were put by the 'ubi_wl_put_peb()' function. * * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter * header. The rest of the physical eraseblock contains only %0xFF bytes. * * When physical eraseblocks are returned to the WL sub-system by means of the * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is * done asynchronously in context of the per-UBI device background thread, * which is also managed by the WL sub-system. * * The wear-leveling is ensured by means of moving the contents of used * physical eraseblocks with low erase counter to free physical eraseblocks * with high erase counter. * * If the WL sub-system fails to erase a physical eraseblock, it marks it as * bad. * * This sub-system is also responsible for scrubbing. If a bit-flip is detected * in a physical eraseblock, it has to be moved. Technically this is the same * as moving it for wear-leveling reasons. * * As it was said, for the UBI sub-system all physical eraseblocks are either * "free" or "used". Free eraseblock are kept in the @wl->free RB-tree, while * used eraseblocks are kept in @wl->used, @wl->erroneous, or @wl->scrub * RB-trees, as well as (temporarily) in the @wl->pq queue. * * When the WL sub-system returns a physical eraseblock, the physical * eraseblock is protected from being moved for some "time". For this reason, * the physical eraseblock is not directly moved from the @wl->free tree to the * @wl->used tree. There is a protection queue in between where this * physical eraseblock is temporarily stored (@wl->pq). * * All this protection stuff is needed because: * o we don't want to move physical eraseblocks just after we have given them * to the user; instead, we first want to let users fill them up with data; * * o there is a chance that the user will put the physical eraseblock very * soon, so it makes sense not to move it for some time, but wait. * * Physical eraseblocks stay protected only for limited time. But the "time" is * measured in erase cycles in this case. This is implemented with help of the * protection queue. Eraseblocks are put to the tail of this queue when they * are returned by the 'ubi_wl_get_peb()', and eraseblocks are removed from the * head of the queue on each erase operation (for any eraseblock). So the * length of the queue defines how may (global) erase cycles PEBs are protected. * * To put it differently, each physical eraseblock has 2 main states: free and * used. The former state corresponds to the @wl->free tree. The latter state * is split up on several sub-states: * o the WL movement is allowed (@wl->used tree); * o the WL movement is disallowed (@wl->erroneous) because the PEB is * erroneous - e.g., there was a read error; * o the WL movement is temporarily prohibited (@wl->pq queue); * o scrubbing is needed (@wl->scrub tree). * * Depending on the sub-state, wear-leveling entries of the used physical * eraseblocks may be kept in one of those structures. * * Note, in this implementation, we keep a small in-RAM object for each physical * eraseblock. This is surely not a scalable solution. But it appears to be good * enough for moderately large flashes and it is simple. In future, one may * re-work this sub-system and make it more scalable. * * At the moment this sub-system does not utilize the sequence number, which * was introduced relatively recently. But it would be wise to do this because * the sequence number of a logical eraseblock characterizes how old is it. For * example, when we move a PEB with low erase counter, and we need to pick the * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we * pick target PEB with an average EC if our PEB is not very "old". This is a * room for future re-works of the WL sub-system. */ #include <linux/slab.h> #include <linux/crc32.h> #include <linux/freezer.h> #include <linux/kthread.h> #include "ubi.h" #include "wl.h" /* Number of physical eraseblocks reserved for wear-leveling purposes */ #define WL_RESERVED_PEBS 1 /* * Maximum difference between two erase counters. If this threshold is * exceeded, the WL sub-system starts moving data from used physical * eraseblocks with low erase counter to free physical eraseblocks with high * erase counter. */ #define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD /* * When a physical eraseblock is moved, the WL sub-system has to pick the target * physical eraseblock to move to. The simplest way would be just to pick the * one with the highest erase counter. But in certain workloads this could lead * to an unlimited wear of one or few physical eraseblock. Indeed, imagine a * situation when the picked physical eraseblock is constantly erased after the * data is written to it. So, we have a constant which limits the highest erase * counter of the free physical eraseblock to pick. Namely, the WL sub-system * does not pick eraseblocks with erase counter greater than the lowest erase * counter plus %WL_FREE_MAX_DIFF. */ #define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD) /* * Maximum number of consecutive background thread failures which is enough to * switch to read-only mode. */ #define WL_MAX_FAILURES 32 static int self_check_ec(struct ubi_device *ubi, int pnum, int ec); static int self_check_in_wl_tree(const struct ubi_device *ubi, struct ubi_wl_entry *e, struct rb_root *root); static int self_check_in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e); /** * wl_tree_add - add a wear-leveling entry to a WL RB-tree. * @e: the wear-leveling entry to add * @root: the root of the tree * * Note, we use (erase counter, physical eraseblock number) pairs as keys in * the @ubi->used and @ubi->free RB-trees. */ static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root) { struct rb_node **p, *parent = NULL; p = &root->rb_node; while (*p) { struct ubi_wl_entry *e1; parent = *p; e1 = rb_entry(parent, struct ubi_wl_entry, u.rb); if (e->ec < e1->ec) p = &(*p)->rb_left; else if (e->ec > e1->ec) p = &(*p)->rb_right; else { ubi_assert(e->pnum != e1->pnum); if (e->pnum < e1->pnum) p = &(*p)->rb_left; else p = &(*p)->rb_right; } } rb_link_node(&e->u.rb, parent, p); rb_insert_color(&e->u.rb, root); } /** * wl_entry_destroy - destroy a wear-leveling entry. * @ubi: UBI device description object * @e: the wear-leveling entry to add * * This function destroys a wear leveling entry and removes * the reference from the lookup table. */ static void wl_entry_destroy(struct ubi_device *ubi, struct ubi_wl_entry *e) { ubi->lookuptbl[e->pnum] = NULL; kmem_cache_free(ubi_wl_entry_slab, e); } /** * do_work - do one pending work. * @ubi: UBI device description object * @executed: whether there is one work is executed * * This function returns zero in case of success and a negative error code in * case of failure. If @executed is not NULL and there is one work executed, * @executed is set as %1, otherwise @executed is set as %0. */ static int do_work(struct ubi_device *ubi, int *executed) { int err; struct ubi_work *wrk; cond_resched(); /* * @ubi->work_sem is used to synchronize with the workers. Workers take * it in read mode, so many of them may be doing works at a time. But * the queue flush code has to be sure the whole queue of works is * done, and it takes the mutex in write mode. */ down_read(&ubi->work_sem); spin_lock(&ubi->wl_lock); if (list_empty(&ubi->works)) { spin_unlock(&ubi->wl_lock); up_read(&ubi->work_sem); if (executed) *executed = 0; return 0; } if (executed) *executed = 1; wrk = list_entry(ubi->works.next, struct ubi_work, list); list_del(&wrk->list); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); spin_unlock(&ubi->wl_lock); /* * Call the worker function. Do not touch the work structure * after this call as it will have been freed or reused by that * time by the worker function. */ err = wrk->func(ubi, wrk, 0); if (err) ubi_err(ubi, "work failed with error code %d", err); up_read(&ubi->work_sem); return err; } /** * in_wl_tree - check if wear-leveling entry is present in a WL RB-tree. * @e: the wear-leveling entry to check * @root: the root of the tree * * This function returns non-zero if @e is in the @root RB-tree and zero if it * is not. */ static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root) { struct rb_node *p; p = root->rb_node; while (p) { struct ubi_wl_entry *e1; e1 = rb_entry(p, struct ubi_wl_entry, u.rb); if (e->pnum == e1->pnum) { ubi_assert(e == e1); return 1; } if (e->ec < e1->ec) p = p->rb_left; else if (e->ec > e1->ec) p = p->rb_right; else { ubi_assert(e->pnum != e1->pnum); if (e->pnum < e1->pnum) p = p->rb_left; else p = p->rb_right; } } return 0; } /** * in_pq - check if a wear-leveling entry is present in the protection queue. * @ubi: UBI device description object * @e: the wear-leveling entry to check * * This function returns non-zero if @e is in the protection queue and zero * if it is not. */ static inline int in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e) { struct ubi_wl_entry *p; int i; for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) list_for_each_entry(p, &ubi->pq[i], u.list) if (p == e) return 1; return 0; } /** * prot_queue_add - add physical eraseblock to the protection queue. * @ubi: UBI device description object * @e: the physical eraseblock to add * * This function adds @e to the tail of the protection queue @ubi->pq, where * @e will stay for %UBI_PROT_QUEUE_LEN erase operations and will be * temporarily protected from the wear-leveling worker. Note, @wl->lock has to * be locked. */ static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e) { int pq_tail = ubi->pq_head - 1; if (pq_tail < 0) pq_tail = UBI_PROT_QUEUE_LEN - 1; ubi_assert(pq_tail >= 0 && pq_tail < UBI_PROT_QUEUE_LEN); list_add_tail(&e->u.list, &ubi->pq[pq_tail]); dbg_wl("added PEB %d EC %d to the protection queue", e->pnum, e->ec); } /** * find_wl_entry - find wear-leveling entry closest to certain erase counter. * @ubi: UBI device description object * @root: the RB-tree where to look for * @diff: maximum possible difference from the smallest erase counter * @pick_max: pick PEB even its erase counter beyonds 'min_ec + @diff' * * This function looks for a wear leveling entry with erase counter closest to * min + @diff, where min is the smallest erase counter. */ static struct ubi_wl_entry *find_wl_entry(struct ubi_device *ubi, struct rb_root *root, int diff, int pick_max) { struct rb_node *p; struct ubi_wl_entry *e; int max; e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb); max = e->ec + diff; p = root->rb_node; while (p) { struct ubi_wl_entry *e1; e1 = rb_entry(p, struct ubi_wl_entry, u.rb); if (e1->ec >= max) { if (pick_max) e = e1; p = p->rb_left; } else { p = p->rb_right; e = e1; } } return e; } /** * find_mean_wl_entry - find wear-leveling entry with medium erase counter. * @ubi: UBI device description object * @root: the RB-tree where to look for * * This function looks for a wear leveling entry with medium erase counter, * but not greater or equivalent than the lowest erase counter plus * %WL_FREE_MAX_DIFF/2. */ static struct ubi_wl_entry *find_mean_wl_entry(struct ubi_device *ubi, struct rb_root *root) { struct ubi_wl_entry *e, *first, *last; first = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb); last = rb_entry(rb_last(root), struct ubi_wl_entry, u.rb); if (last->ec - first->ec < WL_FREE_MAX_DIFF) { e = rb_entry(root->rb_node, struct ubi_wl_entry, u.rb); /* * If no fastmap has been written and fm_anchor is not * reserved and this WL entry can be used as anchor PEB * hold it back and return the second best WL entry such * that fastmap can use the anchor PEB later. */ e = may_reserve_for_fm(ubi, e, root); } else e = find_wl_entry(ubi, root, WL_FREE_MAX_DIFF/2, 0); return e; } /** * wl_get_wle - get a mean wl entry to be used by ubi_wl_get_peb() or * refill_wl_user_pool(). * @ubi: UBI device description object * * This function returns a wear leveling entry in case of success and * NULL in case of failure. */ static struct ubi_wl_entry *wl_get_wle(struct ubi_device *ubi) { struct ubi_wl_entry *e; e = find_mean_wl_entry(ubi, &ubi->free); if (!e) { ubi_err(ubi, "no free eraseblocks"); return NULL; } self_check_in_wl_tree(ubi, e, &ubi->free); /* * Move the physical eraseblock to the protection queue where it will * be protected from being moved for some time. */ rb_erase(&e->u.rb, &ubi->free); ubi->free_count--; dbg_wl("PEB %d EC %d", e->pnum, e->ec); return e; } /** * prot_queue_del - remove a physical eraseblock from the protection queue. * @ubi: UBI device description object * @pnum: the physical eraseblock to remove * * This function deletes PEB @pnum from the protection queue and returns zero * in case of success and %-ENODEV if the PEB was not found. */ static int prot_queue_del(struct ubi_device *ubi, int pnum) { struct ubi_wl_entry *e; e = ubi->lookuptbl[pnum]; if (!e) return -ENODEV; if (self_check_in_pq(ubi, e)) return -ENODEV; list_del(&e->u.list); dbg_wl("deleted PEB %d from the protection queue", e->pnum); return 0; } /** * ubi_sync_erase - synchronously erase a physical eraseblock. * @ubi: UBI device description object * @e: the physical eraseblock to erase * @torture: if the physical eraseblock has to be tortured * * This function returns zero in case of success and a negative error code in * case of failure. */ int ubi_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture) { int err; struct ubi_ec_hdr *ec_hdr; unsigned long long ec = e->ec; dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec); err = self_check_ec(ubi, e->pnum, e->ec); if (err) return -EINVAL; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); if (!ec_hdr) return -ENOMEM; err = ubi_io_sync_erase(ubi, e->pnum, torture); if (err < 0) goto out_free; ec += err; if (ec > UBI_MAX_ERASECOUNTER) { /* * Erase counter overflow. Upgrade UBI and use 64-bit * erase counters internally. */ ubi_err(ubi, "erase counter overflow at PEB %d, EC %llu", e->pnum, ec); err = -EINVAL; goto out_free; } dbg_wl("erased PEB %d, new EC %llu", e->pnum, ec); ec_hdr->ec = cpu_to_be64(ec); err = ubi_io_write_ec_hdr(ubi, e->pnum, ec_hdr); if (err) goto out_free; e->ec = ec; spin_lock(&ubi->wl_lock); if (e->ec > ubi->max_ec) ubi->max_ec = e->ec; spin_unlock(&ubi->wl_lock); out_free: kfree(ec_hdr); return err; } /** * serve_prot_queue - check if it is time to stop protecting PEBs. * @ubi: UBI device description object * * This function is called after each erase operation and removes PEBs from the * tail of the protection queue. These PEBs have been protected for long enough * and should be moved to the used tree. */ static void serve_prot_queue(struct ubi_device *ubi) { struct ubi_wl_entry *e, *tmp; int count; /* * There may be several protected physical eraseblock to remove, * process them all. */ repeat: count = 0; spin_lock(&ubi->wl_lock); list_for_each_entry_safe(e, tmp, &ubi->pq[ubi->pq_head], u.list) { dbg_wl("PEB %d EC %d protection over, move to used tree", e->pnum, e->ec); list_del(&e->u.list); wl_tree_add(e, &ubi->used); if (count++ > 32) { /* * Let's be nice and avoid holding the spinlock for * too long. */ spin_unlock(&ubi->wl_lock); cond_resched(); goto repeat; } } ubi->pq_head += 1; if (ubi->pq_head == UBI_PROT_QUEUE_LEN) ubi->pq_head = 0; ubi_assert(ubi->pq_head >= 0 && ubi->pq_head < UBI_PROT_QUEUE_LEN); spin_unlock(&ubi->wl_lock); } /** * __schedule_ubi_work - schedule a work. * @ubi: UBI device description object * @wrk: the work to schedule * * This function adds a work defined by @wrk to the tail of the pending works * list. Can only be used if ubi->work_sem is already held in read mode! */ static void __schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) { spin_lock(&ubi->wl_lock); list_add_tail(&wrk->list, &ubi->works); ubi_assert(ubi->works_count >= 0); ubi->works_count += 1; if (ubi->thread_enabled && !ubi_dbg_is_bgt_disabled(ubi)) wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); } /** * schedule_ubi_work - schedule a work. * @ubi: UBI device description object * @wrk: the work to schedule * * This function adds a work defined by @wrk to the tail of the pending works * list. */ static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) { down_read(&ubi->work_sem); __schedule_ubi_work(ubi, wrk); up_read(&ubi->work_sem); } static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, int shutdown); /** * schedule_erase - schedule an erase work. * @ubi: UBI device description object * @e: the WL entry of the physical eraseblock to erase * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured * @nested: denotes whether the work_sem is already held * * This function returns zero in case of success and a %-ENOMEM in case of * failure. */ static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int vol_id, int lnum, int torture, bool nested) { struct ubi_work *wl_wrk; ubi_assert(e); dbg_wl("schedule erasure of PEB %d, EC %d, torture %d", e->pnum, e->ec, torture); wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS); if (!wl_wrk) return -ENOMEM; wl_wrk->func = &erase_worker; wl_wrk->e = e; wl_wrk->vol_id = vol_id; wl_wrk->lnum = lnum; wl_wrk->torture = torture; if (nested) __schedule_ubi_work(ubi, wl_wrk); else schedule_ubi_work(ubi, wl_wrk); return 0; } static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk); /** * do_sync_erase - run the erase worker synchronously. * @ubi: UBI device description object * @e: the WL entry of the physical eraseblock to erase * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @torture: if the physical eraseblock has to be tortured * */ static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int vol_id, int lnum, int torture) { struct ubi_work wl_wrk; dbg_wl("sync erase of PEB %i", e->pnum); wl_wrk.e = e; wl_wrk.vol_id = vol_id; wl_wrk.lnum = lnum; wl_wrk.torture = torture; return __erase_worker(ubi, &wl_wrk); } static int ensure_wear_leveling(struct ubi_device *ubi, int nested); /** * wear_leveling_worker - wear-leveling worker function. * @ubi: UBI device description object * @wrk: the work object * @shutdown: non-zero if the worker has to free memory and exit * because the WL-subsystem is shutting down * * This function copies a more worn out physical eraseblock to a less worn out * one. Returns zero in case of success and a negative error code in case of * failure. */ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, int shutdown) { int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0; int erase = 0, keep = 0, vol_id = -1, lnum = -1; struct ubi_wl_entry *e1, *e2; struct ubi_vid_io_buf *vidb; struct ubi_vid_hdr *vid_hdr; int dst_leb_clean = 0; kfree(wrk); if (shutdown) return 0; vidb = ubi_alloc_vid_buf(ubi, GFP_NOFS); if (!vidb) return -ENOMEM; vid_hdr = ubi_get_vid_hdr(vidb); down_read(&ubi->fm_eba_sem); mutex_lock(&ubi->move_mutex); spin_lock(&ubi->wl_lock); ubi_assert(!ubi->move_from && !ubi->move_to); ubi_assert(!ubi->move_to_put); #ifdef CONFIG_MTD_UBI_FASTMAP if (!next_peb_for_wl(ubi, true) || #else if (!ubi->free.rb_node || #endif (!ubi->used.rb_node && !ubi->scrub.rb_node)) { /* * No free physical eraseblocks? Well, they must be waiting in * the queue to be erased. Cancel movement - it will be * triggered again when a free physical eraseblock appears. * * No used physical eraseblocks? They must be temporarily * protected from being moved. They will be moved to the * @ubi->used tree later and the wear-leveling will be * triggered again. */ dbg_wl("cancel WL, a list is empty: free %d, used %d", !ubi->free.rb_node, !ubi->used.rb_node); goto out_cancel; } #ifdef CONFIG_MTD_UBI_FASTMAP e1 = find_anchor_wl_entry(&ubi->used); if (e1 && ubi->fm_anchor && (ubi->fm_anchor->ec - e1->ec >= UBI_WL_THRESHOLD)) { ubi->fm_do_produce_anchor = 1; /* * fm_anchor is no longer considered a good anchor. * NULL assignment also prevents multiple wear level checks * of this PEB. */ wl_tree_add(ubi->fm_anchor, &ubi->free); ubi->fm_anchor = NULL; ubi->free_count++; } if (ubi->fm_do_produce_anchor) { if (!e1) goto out_cancel; e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; self_check_in_wl_tree(ubi, e1, &ubi->used); rb_erase(&e1->u.rb, &ubi->used); dbg_wl("anchor-move PEB %d to PEB %d", e1->pnum, e2->pnum); ubi->fm_do_produce_anchor = 0; } else if (!ubi->scrub.rb_node) { #else if (!ubi->scrub.rb_node) { #endif /* * Now pick the least worn-out used physical eraseblock and a * highly worn-out free physical eraseblock. If the erase * counters differ much enough, start wear-leveling. */ e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) { dbg_wl("no WL needed: min used EC %d, max free EC %d", e1->ec, e2->ec); /* Give the unused PEB back */ wl_tree_add(e2, &ubi->free); ubi->free_count++; goto out_cancel; } self_check_in_wl_tree(ubi, e1, &ubi->used); rb_erase(&e1->u.rb, &ubi->used); dbg_wl("move PEB %d EC %d to PEB %d EC %d", e1->pnum, e1->ec, e2->pnum, e2->ec); } else { /* Perform scrubbing */ scrubbing = 1; e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb); e2 = get_peb_for_wl(ubi); if (!e2) goto out_cancel; self_check_in_wl_tree(ubi, e1, &ubi->scrub); rb_erase(&e1->u.rb, &ubi->scrub); dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum); } ubi->move_from = e1; ubi->move_to = e2; spin_unlock(&ubi->wl_lock); /* * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum. * We so far do not know which logical eraseblock our physical * eraseblock (@e1) belongs to. We have to read the volume identifier * header first. * * Note, we are protected from this PEB being unmapped and erased. The * 'ubi_wl_put_peb()' would wait for moving to be finished if the PEB * which is being moved was unmapped. */ err = ubi_io_read_vid_hdr(ubi, e1->pnum, vidb, 0); if (err && err != UBI_IO_BITFLIPS) { dst_leb_clean = 1; if (err == UBI_IO_FF) { /* * We are trying to move PEB without a VID header. UBI * always write VID headers shortly after the PEB was * given, so we have a situation when it has not yet * had a chance to write it, because it was preempted. * So add this PEB to the protection queue so far, * because presumably more data will be written there * (including the missing VID header), and then we'll * move it. */ dbg_wl("PEB %d has no VID header", e1->pnum); protect = 1; goto out_not_moved; } else if (err == UBI_IO_FF_BITFLIPS) { /* * The same situation as %UBI_IO_FF, but bit-flips were * detected. It is better to schedule this PEB for * scrubbing. */ dbg_wl("PEB %d has no VID header but has bit-flips", e1->pnum); scrubbing = 1; goto out_not_moved; } else if (ubi->fast_attach && err == UBI_IO_BAD_HDR_EBADMSG) { /* * While a full scan would detect interrupted erasures * at attach time we can face them here when attached from * Fastmap. */ dbg_wl("PEB %d has ECC errors, maybe from an interrupted erasure", e1->pnum); erase = 1; goto out_not_moved; } ubi_err(ubi, "error %d while reading VID header from PEB %d", err, e1->pnum); goto out_error; } vol_id = be32_to_cpu(vid_hdr->vol_id); lnum = be32_to_cpu(vid_hdr->lnum); err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vidb); if (err) { if (err == MOVE_CANCEL_RACE) { /* * The LEB has not been moved because the volume is * being deleted or the PEB has been put meanwhile. We * should prevent this PEB from being selected for * wear-leveling movement again, so put it to the * protection queue. */ protect = 1; dst_leb_clean = 1; goto out_not_moved; } if (err == MOVE_RETRY) { /* * For source PEB: * 1. The scrubbing is set for scrub type PEB, it will * be put back into ubi->scrub list. * 2. Non-scrub type PEB will be put back into ubi->used * list. */ keep = 1; dst_leb_clean = 1; goto out_not_moved; } if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR || err == MOVE_TARGET_RD_ERR) { /* * Target PEB had bit-flips or write error - torture it. */ torture = 1; keep = 1; goto out_not_moved; } if (err == MOVE_SOURCE_RD_ERR) { /* * An error happened while reading the source PEB. Do * not switch to R/O mode in this case, and give the * upper layers a possibility to recover from this, * e.g. by unmapping corresponding LEB. Instead, just * put this PEB to the @ubi->erroneous list to prevent * UBI from trying to move it over and over again. */ if (ubi->erroneous_peb_count > ubi->max_erroneous) { ubi_err(ubi, "too many erroneous eraseblocks (%d)", ubi->erroneous_peb_count); goto out_error; } dst_leb_clean = 1; erroneous = 1; goto out_not_moved; } if (err < 0) goto out_error; ubi_assert(0); } /* The PEB has been successfully moved */ if (scrubbing) ubi_msg(ubi, "scrubbed PEB %d (LEB %d:%d), data moved to PEB %d", e1->pnum, vol_id, lnum, e2->pnum); ubi_free_vid_buf(vidb); spin_lock(&ubi->wl_lock); if (!ubi->move_to_put) { wl_tree_add(e2, &ubi->used); e2 = NULL; } ubi->move_from = ubi->move_to = NULL; ubi->move_to_put = ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); err = do_sync_erase(ubi, e1, vol_id, lnum, 0); if (err) { if (e2) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e2); spin_unlock(&ubi->wl_lock); } goto out_ro; } if (e2) { /* * Well, the target PEB was put meanwhile, schedule it for * erasure. */ dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase", e2->pnum, vol_id, lnum); err = do_sync_erase(ubi, e2, vol_id, lnum, 0); if (err) goto out_ro; } dbg_wl("done"); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); return 0; /* * For some reasons the LEB was not moved, might be an error, might be * something else. @e1 was not changed, so return it back. @e2 might * have been changed, schedule it for erasure. */ out_not_moved: if (vol_id != -1) dbg_wl("cancel moving PEB %d (LEB %d:%d) to PEB %d (%d)", e1->pnum, vol_id, lnum, e2->pnum, err); else dbg_wl("cancel moving PEB %d to PEB %d (%d)", e1->pnum, e2->pnum, err); spin_lock(&ubi->wl_lock); if (protect) prot_queue_add(ubi, e1); else if (erroneous) { wl_tree_add(e1, &ubi->erroneous); ubi->erroneous_peb_count += 1; } else if (scrubbing) wl_tree_add(e1, &ubi->scrub); else if (keep) wl_tree_add(e1, &ubi->used); if (dst_leb_clean) { wl_tree_add(e2, &ubi->free); ubi->free_count++; } ubi_assert(!ubi->move_to_put); ubi->move_from = ubi->move_to = NULL; ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); ubi_free_vid_buf(vidb); if (dst_leb_clean) { ensure_wear_leveling(ubi, 1); } else { err = do_sync_erase(ubi, e2, vol_id, lnum, torture); if (err) goto out_ro; } if (erase) { err = do_sync_erase(ubi, e1, vol_id, lnum, 1); if (err) goto out_ro; } mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); return 0; out_error: if (vol_id != -1) ubi_err(ubi, "error %d while moving PEB %d to PEB %d", err, e1->pnum, e2->pnum); else ubi_err(ubi, "error %d while moving PEB %d (LEB %d:%d) to PEB %d", err, e1->pnum, vol_id, lnum, e2->pnum); spin_lock(&ubi->wl_lock); ubi->move_from = ubi->move_to = NULL; ubi->move_to_put = ubi->wl_scheduled = 0; wl_entry_destroy(ubi, e1); wl_entry_destroy(ubi, e2); spin_unlock(&ubi->wl_lock); ubi_free_vid_buf(vidb); out_ro: ubi_ro_mode(ubi); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); ubi_assert(err != 0); return err < 0 ? err : -EIO; out_cancel: ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); mutex_unlock(&ubi->move_mutex); up_read(&ubi->fm_eba_sem); ubi_free_vid_buf(vidb); return 0; } /** * ensure_wear_leveling - schedule wear-leveling if it is needed. * @ubi: UBI device description object * @nested: set to non-zero if this function is called from UBI worker * * This function checks if it is time to start wear-leveling and schedules it * if yes. This function returns zero in case of success and a negative error * code in case of failure. */ static int ensure_wear_leveling(struct ubi_device *ubi, int nested) { int err = 0; struct ubi_work *wrk; spin_lock(&ubi->wl_lock); if (ubi->wl_scheduled) /* Wear-leveling is already in the work queue */ goto out_unlock; /* * If the ubi->scrub tree is not empty, scrubbing is needed, and the * WL worker has to be scheduled anyway. */ if (!ubi->scrub.rb_node) { #ifdef CONFIG_MTD_UBI_FASTMAP if (!need_wear_leveling(ubi)) goto out_unlock; #else struct ubi_wl_entry *e1; struct ubi_wl_entry *e2; if (!ubi->used.rb_node || !ubi->free.rb_node) /* No physical eraseblocks - no deal */ goto out_unlock; /* * We schedule wear-leveling only if the difference between the * lowest erase counter of used physical eraseblocks and a high * erase counter of free physical eraseblocks is greater than * %UBI_WL_THRESHOLD. */ e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF, 0); if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) goto out_unlock; #endif dbg_wl("schedule wear-leveling"); } else dbg_wl("schedule scrubbing"); ubi->wl_scheduled = 1; spin_unlock(&ubi->wl_lock); wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS); if (!wrk) { err = -ENOMEM; goto out_cancel; } wrk->func = &wear_leveling_worker; if (nested) __schedule_ubi_work(ubi, wrk); else schedule_ubi_work(ubi, wrk); return err; out_cancel: spin_lock(&ubi->wl_lock); ubi->wl_scheduled = 0; out_unlock: spin_unlock(&ubi->wl_lock); return err; } /** * __erase_worker - physical eraseblock erase worker function. * @ubi: UBI device description object * @wl_wrk: the work object * * This function erases a physical eraseblock and perform torture testing if * needed. It also takes care about marking the physical eraseblock bad if * needed. Returns zero in case of success and a negative error code in case of * failure. */ static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk) { struct ubi_wl_entry *e = wl_wrk->e; int pnum = e->pnum; int vol_id = wl_wrk->vol_id; int lnum = wl_wrk->lnum; int err, available_consumed = 0; dbg_wl("erase PEB %d EC %d LEB %d:%d", pnum, e->ec, wl_wrk->vol_id, wl_wrk->lnum); err = ubi_sync_erase(ubi, e, wl_wrk->torture); if (!err) { spin_lock(&ubi->wl_lock); if (!ubi->fm_disabled && !ubi->fm_anchor && e->pnum < UBI_FM_MAX_START) { /* * Abort anchor production, if needed it will be * enabled again in the wear leveling started below. */ ubi->fm_anchor = e; ubi->fm_do_produce_anchor = 0; } else { wl_tree_add(e, &ubi->free); ubi->free_count++; } spin_unlock(&ubi->wl_lock); /* * One more erase operation has happened, take care about * protected physical eraseblocks. */ serve_prot_queue(ubi); /* And take care about wear-leveling */ err = ensure_wear_leveling(ubi, 1); return err; } ubi_err(ubi, "failed to erase PEB %d, error %d", pnum, err); if (err == -EINTR || err == -ENOMEM || err == -EAGAIN || err == -EBUSY) { int err1; /* Re-schedule the LEB for erasure */ err1 = schedule_erase(ubi, e, vol_id, lnum, 0, true); if (err1) { spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); spin_unlock(&ubi->wl_lock); err = err1; goto out_ro; } return err; } spin_lock(&ubi->wl_lock); wl_entry_destroy(ubi, e); spin_unlock(&ubi->wl_lock); if (err != -EIO) /* * If this is not %-EIO, we have no idea what to do. Scheduling * this physical eraseblock for erasure again would cause * errors again and again. Well, lets switch to R/O mode. */ goto out_ro; /* It is %-EIO, the PEB went bad */ if (!ubi->bad_allowed) { ubi_err(ubi, "bad physical eraseblock %d detected", pnum); goto out_ro; } spin_lock(&ubi->volumes_lock); if (ubi->beb_rsvd_pebs == 0) { if (ubi->avail_pebs == 0) { spin_unlock(&ubi->volumes_lock); ubi_err(ubi, "no reserved/available physical eraseblocks"); goto out_ro; } ubi->avail_pebs -= 1; available_consumed = 1; } spin_unlock(&ubi->volumes_lock); ubi_msg(ubi, "mark PEB %d as bad", pnum); err = ubi_io_mark_bad(ubi, pnum); if (err) goto out_ro; spin_lock(&ubi->volumes_lock); if (ubi->beb_rsvd_pebs > 0) { if (available_consumed) { /* * The amount of reserved PEBs increased since we last * checked. */ ubi->avail_pebs += 1; available_consumed = 0; } ubi->beb_rsvd_pebs -= 1; } ubi->bad_peb_count += 1; ubi->good_peb_count -= 1; ubi_calculate_reserved(ubi); if (available_consumed) ubi_warn(ubi, "no PEBs in the reserved pool, used an available PEB"); else if (ubi->beb_rsvd_pebs) ubi_msg(ubi, "%d PEBs left in the reserve", ubi->beb_rsvd_pebs); else ubi_warn(ubi, "last PEB from the reserve was used"); spin_unlock(&ubi->volumes_lock); return err; out_ro: if (available_consumed) { spin_lock(&ubi->volumes_lock); ubi->avail_pebs += 1; spin_unlock(&ubi->volumes_lock); } ubi_ro_mode(ubi); return err; } static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, int shutdown) { int ret; if (shutdown) { struct ubi_wl_entry *e = wl_wrk->e; dbg_wl("cancel erasure of PEB %d EC %d", e->pnum, e->ec); kfree(wl_wrk); wl_entry_destroy(ubi, e); return 0; } ret = __erase_worker(ubi, wl_wrk); kfree(wl_wrk); return ret; } /** * ubi_wl_put_peb - return a PEB to the wear-leveling sub-system. * @ubi: UBI device description object * @vol_id: the volume ID that last used this PEB * @lnum: the last used logical eraseblock number for the PEB * @pnum: physical eraseblock to return * @torture: if this physical eraseblock has to be tortured * * This function is called to return physical eraseblock @pnum to the pool of * free physical eraseblocks. The @torture flag has to be set if an I/O error * occurred to this @pnum and it has to be tested. This function returns zero * in case of success, and a negative error code in case of failure. */ int ubi_wl_put_peb(struct ubi_device *ubi, int vol_id, int lnum, int pnum, int torture) { int err; struct ubi_wl_entry *e; dbg_wl("PEB %d", pnum); ubi_assert(pnum >= 0); ubi_assert(pnum < ubi->peb_count); down_read(&ubi->fm_protect); retry: spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (!e) { /* * This wl entry has been removed for some errors by other * process (eg. wear leveling worker), corresponding process * (except __erase_worker, which cannot concurrent with * ubi_wl_put_peb) will set ubi ro_mode at the same time, * just ignore this wl entry. */ spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return 0; } if (e == ubi->move_from) { /* * User is putting the physical eraseblock which was selected to * be moved. It will be scheduled for erasure in the * wear-leveling worker. */ dbg_wl("PEB %d is being moved, wait", pnum); spin_unlock(&ubi->wl_lock); /* Wait for the WL worker by taking the @ubi->move_mutex */ mutex_lock(&ubi->move_mutex); mutex_unlock(&ubi->move_mutex); goto retry; } else if (e == ubi->move_to) { /* * User is putting the physical eraseblock which was selected * as the target the data is moved to. It may happen if the EBA * sub-system already re-mapped the LEB in 'ubi_eba_copy_leb()' * but the WL sub-system has not put the PEB to the "used" tree * yet, but it is about to do this. So we just set a flag which * will tell the WL worker that the PEB is not needed anymore * and should be scheduled for erasure. */ dbg_wl("PEB %d is the target of data moving", pnum); ubi_assert(!ubi->move_to_put); ubi->move_to_put = 1; spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return 0; } else { if (in_wl_tree(e, &ubi->used)) { self_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else if (in_wl_tree(e, &ubi->scrub)) { self_check_in_wl_tree(ubi, e, &ubi->scrub); rb_erase(&e->u.rb, &ubi->scrub); } else if (in_wl_tree(e, &ubi->erroneous)) { self_check_in_wl_tree(ubi, e, &ubi->erroneous); rb_erase(&e->u.rb, &ubi->erroneous); ubi->erroneous_peb_count -= 1; ubi_assert(ubi->erroneous_peb_count >= 0); /* Erroneous PEBs should be tortured */ torture = 1; } else { err = prot_queue_del(ubi, e->pnum); if (err) { ubi_err(ubi, "PEB %d not found", pnum); ubi_ro_mode(ubi); spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_protect); return err; } } } spin_unlock(&ubi->wl_lock); err = schedule_erase(ubi, e, vol_id, lnum, torture, false); if (err) { spin_lock(&ubi->wl_lock); wl_tree_add(e, &ubi->used); spin_unlock(&ubi->wl_lock); } up_read(&ubi->fm_protect); return err; } /** * ubi_wl_scrub_peb - schedule a physical eraseblock for scrubbing. * @ubi: UBI device description object * @pnum: the physical eraseblock to schedule * * If a bit-flip in a physical eraseblock is detected, this physical eraseblock * needs scrubbing. This function schedules a physical eraseblock for * scrubbing which is done in background. This function returns zero in case of * success and a negative error code in case of failure. */ int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum) { struct ubi_wl_entry *e; ubi_msg(ubi, "schedule PEB %d for scrubbing", pnum); retry: spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (e == ubi->move_from || in_wl_tree(e, &ubi->scrub) || in_wl_tree(e, &ubi->erroneous)) { spin_unlock(&ubi->wl_lock); return 0; } if (e == ubi->move_to) { /* * This physical eraseblock was used to move data to. The data * was moved but the PEB was not yet inserted to the proper * tree. We should just wait a little and let the WL worker * proceed. */ spin_unlock(&ubi->wl_lock); dbg_wl("the PEB %d is not in proper tree, retry", pnum); yield(); goto retry; } if (in_wl_tree(e, &ubi->used)) { self_check_in_wl_tree(ubi, e, &ubi->used); rb_erase(&e->u.rb, &ubi->used); } else { int err; err = prot_queue_del(ubi, e->pnum); if (err) { ubi_err(ubi, "PEB %d not found", pnum); ubi_ro_mode(ubi); spin_unlock(&ubi->wl_lock); return err; } } wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); /* * Technically scrubbing is the same as wear-leveling, so it is done * by the WL worker. */ return ensure_wear_leveling(ubi, 0); } /** * ubi_wl_flush - flush all pending works. * @ubi: UBI device description object * @vol_id: the volume id to flush for * @lnum: the logical eraseblock number to flush for * * This function executes all pending works for a particular volume id / * logical eraseblock number pair. If either value is set to %UBI_ALL, then it * acts as a wildcard for all of the corresponding volume numbers or logical * eraseblock numbers. It returns zero in case of success and a negative error * code in case of failure. */ int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum) { int err = 0; int found = 1; /* * Erase while the pending works queue is not empty, but not more than * the number of currently pending works. */ dbg_wl("flush pending work for LEB %d:%d (%d pending works)", vol_id, lnum, ubi->works_count); while (found) { struct ubi_work *wrk, *tmp; found = 0; down_read(&ubi->work_sem); spin_lock(&ubi->wl_lock); list_for_each_entry_safe(wrk, tmp, &ubi->works, list) { if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) && (lnum == UBI_ALL || wrk->lnum == lnum)) { list_del(&wrk->list); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); spin_unlock(&ubi->wl_lock); err = wrk->func(ubi, wrk, 0); if (err) { up_read(&ubi->work_sem); return err; } spin_lock(&ubi->wl_lock); found = 1; break; } } spin_unlock(&ubi->wl_lock); up_read(&ubi->work_sem); } /* * Make sure all the works which have been done in parallel are * finished. */ down_write(&ubi->work_sem); up_write(&ubi->work_sem); return err; } static bool scrub_possible(struct ubi_device *ubi, struct ubi_wl_entry *e) { if (in_wl_tree(e, &ubi->scrub)) return false; else if (in_wl_tree(e, &ubi->erroneous)) return false; else if (ubi->move_from == e) return false; else if (ubi->move_to == e) return false; return true; } /** * ubi_bitflip_check - Check an eraseblock for bitflips and scrub it if needed. * @ubi: UBI device description object * @pnum: the physical eraseblock to schedule * @force: don't read the block, assume bitflips happened and take action. * * This function reads the given eraseblock and checks if bitflips occured. * In case of bitflips, the eraseblock is scheduled for scrubbing. * If scrubbing is forced with @force, the eraseblock is not read, * but scheduled for scrubbing right away. * * Returns: * %EINVAL, PEB is out of range * %ENOENT, PEB is no longer used by UBI * %EBUSY, PEB cannot be checked now or a check is currently running on it * %EAGAIN, bit flips happened but scrubbing is currently not possible * %EUCLEAN, bit flips happened and PEB is scheduled for scrubbing * %0, no bit flips detected */ int ubi_bitflip_check(struct ubi_device *ubi, int pnum, int force) { int err = 0; struct ubi_wl_entry *e; if (pnum < 0 || pnum >= ubi->peb_count) { err = -EINVAL; goto out; } /* * Pause all parallel work, otherwise it can happen that the * erase worker frees a wl entry under us. */ down_write(&ubi->work_sem); /* * Make sure that the wl entry does not change state while * inspecting it. */ spin_lock(&ubi->wl_lock); e = ubi->lookuptbl[pnum]; if (!e) { spin_unlock(&ubi->wl_lock); err = -ENOENT; goto out_resume; } /* * Does it make sense to check this PEB? */ if (!scrub_possible(ubi, e)) { spin_unlock(&ubi->wl_lock); err = -EBUSY; goto out_resume; } spin_unlock(&ubi->wl_lock); if (!force) { mutex_lock(&ubi->buf_mutex); err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size); mutex_unlock(&ubi->buf_mutex); } if (force || err == UBI_IO_BITFLIPS) { /* * Okay, bit flip happened, let's figure out what we can do. */ spin_lock(&ubi->wl_lock); /* * Recheck. We released wl_lock, UBI might have killed the * wl entry under us. */ e = ubi->lookuptbl[pnum]; if (!e) { spin_unlock(&ubi->wl_lock); err = -ENOENT; goto out_resume; } /* * Need to re-check state */ if (!scrub_possible(ubi, e)) { spin_unlock(&ubi->wl_lock); err = -EBUSY; goto out_resume; } if (in_pq(ubi, e)) { prot_queue_del(ubi, e->pnum); wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); err = ensure_wear_leveling(ubi, 1); } else if (in_wl_tree(e, &ubi->used)) { rb_erase(&e->u.rb, &ubi->used); wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); err = ensure_wear_leveling(ubi, 1); } else if (in_wl_tree(e, &ubi->free)) { rb_erase(&e->u.rb, &ubi->free); ubi->free_count--; spin_unlock(&ubi->wl_lock); /* * This PEB is empty we can schedule it for * erasure right away. No wear leveling needed. */ err = schedule_erase(ubi, e, UBI_UNKNOWN, UBI_UNKNOWN, force ? 0 : 1, true); } else { spin_unlock(&ubi->wl_lock); err = -EAGAIN; } if (!err && !force) err = -EUCLEAN; } else { err = 0; } out_resume: up_write(&ubi->work_sem); out: return err; } /** * tree_destroy - destroy an RB-tree. * @ubi: UBI device description object * @root: the root of the tree to destroy */ static void tree_destroy(struct ubi_device *ubi, struct rb_root *root) { struct rb_node *rb; struct ubi_wl_entry *e; rb = root->rb_node; while (rb) { if (rb->rb_left) rb = rb->rb_left; else if (rb->rb_right) rb = rb->rb_right; else { e = rb_entry(rb, struct ubi_wl_entry, u.rb); rb = rb_parent(rb); if (rb) { if (rb->rb_left == &e->u.rb) rb->rb_left = NULL; else rb->rb_right = NULL; } wl_entry_destroy(ubi, e); } } } /** * ubi_thread - UBI background thread. * @u: the UBI device description object pointer */ int ubi_thread(void *u) { int failures = 0; struct ubi_device *ubi = u; ubi_msg(ubi, "background thread \"%s\" started, PID %d", ubi->bgt_name, task_pid_nr(current)); set_freezable(); for (;;) { int err; if (kthread_should_stop()) break; if (try_to_freeze()) continue; spin_lock(&ubi->wl_lock); if (list_empty(&ubi->works) || ubi->ro_mode || !ubi->thread_enabled || ubi_dbg_is_bgt_disabled(ubi)) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&ubi->wl_lock); /* * Check kthread_should_stop() after we set the task * state to guarantee that we either see the stop bit * and exit or the task state is reset to runnable such * that it's not scheduled out indefinitely and detects * the stop bit at kthread_should_stop(). */ if (kthread_should_stop()) { set_current_state(TASK_RUNNING); break; } schedule(); continue; } spin_unlock(&ubi->wl_lock); err = do_work(ubi, NULL); if (err) { ubi_err(ubi, "%s: work failed with error code %d", ubi->bgt_name, err); if (failures++ > WL_MAX_FAILURES) { /* * Too many failures, disable the thread and * switch to read-only mode. */ ubi_msg(ubi, "%s: %d consecutive failures", ubi->bgt_name, WL_MAX_FAILURES); ubi_ro_mode(ubi); ubi->thread_enabled = 0; continue; } } else failures = 0; cond_resched(); } dbg_wl("background thread \"%s\" is killed", ubi->bgt_name); ubi->thread_enabled = 0; return 0; } /** * shutdown_work - shutdown all pending works. * @ubi: UBI device description object */ static void shutdown_work(struct ubi_device *ubi) { while (!list_empty(&ubi->works)) { struct ubi_work *wrk; wrk = list_entry(ubi->works.next, struct ubi_work, list); list_del(&wrk->list); wrk->func(ubi, wrk, 1); ubi->works_count -= 1; ubi_assert(ubi->works_count >= 0); } } /** * erase_aeb - erase a PEB given in UBI attach info PEB * @ubi: UBI device description object * @aeb: UBI attach info PEB * @sync: If true, erase synchronously. Otherwise schedule for erasure */ static int erase_aeb(struct ubi_device *ubi, struct ubi_ainf_peb *aeb, bool sync) { struct ubi_wl_entry *e; int err; e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) return -ENOMEM; e->pnum = aeb->pnum; e->ec = aeb->ec; ubi->lookuptbl[e->pnum] = e; if (sync) { err = ubi_sync_erase(ubi, e, false); if (err) goto out_free; wl_tree_add(e, &ubi->free); ubi->free_count++; } else { err = schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0, false); if (err) goto out_free; } return 0; out_free: wl_entry_destroy(ubi, e); return err; } /** * ubi_wl_init - initialize the WL sub-system using attaching information. * @ubi: UBI device description object * @ai: attaching information * * This function returns zero in case of success, and a negative error code in * case of failure. */ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai) { int err, i, reserved_pebs, found_pebs = 0; struct rb_node *rb1, *rb2; struct ubi_ainf_volume *av; struct ubi_ainf_peb *aeb, *tmp; struct ubi_wl_entry *e; ubi->used = ubi->erroneous = ubi->free = ubi->scrub = RB_ROOT; spin_lock_init(&ubi->wl_lock); mutex_init(&ubi->move_mutex); init_rwsem(&ubi->work_sem); ubi->max_ec = ai->max_ec; INIT_LIST_HEAD(&ubi->works); sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num); err = -ENOMEM; ubi->lookuptbl = kcalloc(ubi->peb_count, sizeof(void *), GFP_KERNEL); if (!ubi->lookuptbl) return err; for (i = 0; i < UBI_PROT_QUEUE_LEN; i++) INIT_LIST_HEAD(&ubi->pq[i]); ubi->pq_head = 0; ubi->free_count = 0; list_for_each_entry_safe(aeb, tmp, &ai->erase, u.list) { cond_resched(); err = erase_aeb(ubi, aeb, false); if (err) goto out_free; found_pebs++; } list_for_each_entry(aeb, &ai->free, u.list) { cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) { err = -ENOMEM; goto out_free; } e->pnum = aeb->pnum; e->ec = aeb->ec; ubi_assert(e->ec >= 0); wl_tree_add(e, &ubi->free); ubi->free_count++; ubi->lookuptbl[e->pnum] = e; found_pebs++; } ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) { ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb) { cond_resched(); e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) { err = -ENOMEM; goto out_free; } e->pnum = aeb->pnum; e->ec = aeb->ec; ubi->lookuptbl[e->pnum] = e; if (!aeb->scrub) { dbg_wl("add PEB %d EC %d to the used tree", e->pnum, e->ec); wl_tree_add(e, &ubi->used); } else { dbg_wl("add PEB %d EC %d to the scrub tree", e->pnum, e->ec); wl_tree_add(e, &ubi->scrub); } found_pebs++; } } list_for_each_entry(aeb, &ai->fastmap, u.list) { cond_resched(); e = ubi_find_fm_block(ubi, aeb->pnum); if (e) { ubi_assert(!ubi->lookuptbl[e->pnum]); ubi->lookuptbl[e->pnum] = e; } else { bool sync = false; /* * Usually old Fastmap PEBs are scheduled for erasure * and we don't have to care about them but if we face * an power cut before scheduling them we need to * take care of them here. */ if (ubi->lookuptbl[aeb->pnum]) continue; /* * The fastmap update code might not find a free PEB for * writing the fastmap anchor to and then reuses the * current fastmap anchor PEB. When this PEB gets erased * and a power cut happens before it is written again we * must make sure that the fastmap attach code doesn't * find any outdated fastmap anchors, hence we erase the * outdated fastmap anchor PEBs synchronously here. */ if (aeb->vol_id == UBI_FM_SB_VOLUME_ID) sync = true; err = erase_aeb(ubi, aeb, sync); if (err) goto out_free; } found_pebs++; } dbg_wl("found %i PEBs", found_pebs); ubi_assert(ubi->good_peb_count == found_pebs); reserved_pebs = WL_RESERVED_PEBS; ubi_fastmap_init(ubi, &reserved_pebs); if (ubi->avail_pebs < reserved_pebs) { ubi_err(ubi, "no enough physical eraseblocks (%d, need %d)", ubi->avail_pebs, reserved_pebs); if (ubi->corr_peb_count) ubi_err(ubi, "%d PEBs are corrupted and not used", ubi->corr_peb_count); err = -ENOSPC; goto out_free; } ubi->avail_pebs -= reserved_pebs; ubi->rsvd_pebs += reserved_pebs; /* Schedule wear-leveling if needed */ err = ensure_wear_leveling(ubi, 0); if (err) goto out_free; #ifdef CONFIG_MTD_UBI_FASTMAP if (!ubi->ro_mode && !ubi->fm_disabled) ubi_ensure_anchor_pebs(ubi); #endif return 0; out_free: shutdown_work(ubi); tree_destroy(ubi, &ubi->used); tree_destroy(ubi, &ubi->free); tree_destroy(ubi, &ubi->scrub); kfree(ubi->lookuptbl); return err; } /** * protection_queue_destroy - destroy the protection queue. * @ubi: UBI device description object */ static void protection_queue_destroy(struct ubi_device *ubi) { int i; struct ubi_wl_entry *e, *tmp; for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) { list_for_each_entry_safe(e, tmp, &ubi->pq[i], u.list) { list_del(&e->u.list); wl_entry_destroy(ubi, e); } } } /** * ubi_wl_close - close the wear-leveling sub-system. * @ubi: UBI device description object */ void ubi_wl_close(struct ubi_device *ubi) { dbg_wl("close the WL sub-system"); ubi_fastmap_close(ubi); shutdown_work(ubi); protection_queue_destroy(ubi); tree_destroy(ubi, &ubi->used); tree_destroy(ubi, &ubi->erroneous); tree_destroy(ubi, &ubi->free); tree_destroy(ubi, &ubi->scrub); kfree(ubi->lookuptbl); } /** * self_check_ec - make sure that the erase counter of a PEB is correct. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * @ec: the erase counter to check * * This function returns zero if the erase counter of physical eraseblock @pnum * is equivalent to @ec, and a negative error code if not or if an error * occurred. */ static int self_check_ec(struct ubi_device *ubi, int pnum, int ec) { int err; long long read_ec; struct ubi_ec_hdr *ec_hdr; if (!ubi_dbg_chk_gen(ubi)) return 0; ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS); if (!ec_hdr) return -ENOMEM; err = ubi_io_read_ec_hdr(ubi, pnum, ec_hdr, 0); if (err && err != UBI_IO_BITFLIPS) { /* The header does not have to exist */ err = 0; goto out_free; } read_ec = be64_to_cpu(ec_hdr->ec); if (ec != read_ec && read_ec - ec > 1) { ubi_err(ubi, "self-check failed for PEB %d", pnum); ubi_err(ubi, "read EC is %lld, should be %d", read_ec, ec); dump_stack(); err = 1; } else err = 0; out_free: kfree(ec_hdr); return err; } /** * self_check_in_wl_tree - check that wear-leveling entry is in WL RB-tree. * @ubi: UBI device description object * @e: the wear-leveling entry to check * @root: the root of the tree * * This function returns zero if @e is in the @root RB-tree and %-EINVAL if it * is not. */ static int self_check_in_wl_tree(const struct ubi_device *ubi, struct ubi_wl_entry *e, struct rb_root *root) { if (!ubi_dbg_chk_gen(ubi)) return 0; if (in_wl_tree(e, root)) return 0; ubi_err(ubi, "self-check failed for PEB %d, EC %d, RB-tree %p ", e->pnum, e->ec, root); dump_stack(); return -EINVAL; } /** * self_check_in_pq - check if wear-leveling entry is in the protection * queue. * @ubi: UBI device description object * @e: the wear-leveling entry to check * * This function returns zero if @e is in @ubi->pq and %-EINVAL if it is not. */ static int self_check_in_pq(const struct ubi_device *ubi, struct ubi_wl_entry *e) { if (!ubi_dbg_chk_gen(ubi)) return 0; if (in_pq(ubi, e)) return 0; ubi_err(ubi, "self-check failed for PEB %d, EC %d, Protect queue", e->pnum, e->ec); dump_stack(); return -EINVAL; } #ifndef CONFIG_MTD_UBI_FASTMAP static struct ubi_wl_entry *get_peb_for_wl(struct ubi_device *ubi) { struct ubi_wl_entry *e; e = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF, 0); self_check_in_wl_tree(ubi, e, &ubi->free); ubi->free_count--; ubi_assert(ubi->free_count >= 0); rb_erase(&e->u.rb, &ubi->free); return e; } /** * produce_free_peb - produce a free physical eraseblock. * @ubi: UBI device description object * * This function tries to make a free PEB by means of synchronous execution of * pending works. This may be needed if, for example the background thread is * disabled. Returns zero in case of success and a negative error code in case * of failure. */ static int produce_free_peb(struct ubi_device *ubi) { int err; while (!ubi->free.rb_node && ubi->works_count) { spin_unlock(&ubi->wl_lock); dbg_wl("do one work synchronously"); err = do_work(ubi, NULL); spin_lock(&ubi->wl_lock); if (err) return err; } return 0; } /** * ubi_wl_get_peb - get a physical eraseblock. * @ubi: UBI device description object * * This function returns a physical eraseblock in case of success and a * negative error code in case of failure. * Returns with ubi->fm_eba_sem held in read mode! */ int ubi_wl_get_peb(struct ubi_device *ubi) { int err; struct ubi_wl_entry *e; retry: down_read(&ubi->fm_eba_sem); spin_lock(&ubi->wl_lock); if (!ubi->free.rb_node) { if (ubi->works_count == 0) { ubi_err(ubi, "no free eraseblocks"); ubi_assert(list_empty(&ubi->works)); spin_unlock(&ubi->wl_lock); return -ENOSPC; } err = produce_free_peb(ubi); if (err < 0) { spin_unlock(&ubi->wl_lock); return err; } spin_unlock(&ubi->wl_lock); up_read(&ubi->fm_eba_sem); goto retry; } e = wl_get_wle(ubi); prot_queue_add(ubi, e); spin_unlock(&ubi->wl_lock); err = ubi_self_check_all_ff(ubi, e->pnum, ubi->vid_hdr_aloffset, ubi->peb_size - ubi->vid_hdr_aloffset); if (err) { ubi_err(ubi, "new PEB %d does not contain all 0xFF bytes", e->pnum); return err; } return e->pnum; } #else #include "fastmap-wl.c" #endif
11 2649 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/xattr.h Extended attributes handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/user_namespace.h> #include <uapi/linux/xattr.h> /* List of all open_how "versions". */ #define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */ #define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0 struct inode; struct dentry; static inline bool is_posix_acl_xattr(const char *name) { return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); } /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and * with a non-empty suffix. */ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; /** * xattr_handler_can_list - check whether xattr can be listed * @handler: handler for this type of xattr * @dentry: dentry whose inode xattr to list * * Determine whether the xattr associated with @dentry can be listed given * @handler. * * Return: true if xattr can be listed, false if not. */ static inline bool xattr_handler_can_list(const struct xattr_handler *handler, struct dentry *dentry) { return handler && (!handler->list || handler->list(dentry)); } const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { const char *name; void *value; size_t value_len; }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct inode **); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int xattr_supports_user_prefix(struct inode *inode); static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; } struct simple_xattrs { struct rb_root rb_root; rwlock_t lock; }; struct simple_xattr { struct rb_node rb_node; char *name; size_t size; char value[]; }; void simple_xattrs_init(struct simple_xattrs *xattrs); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); #endif /* _LINUX_XATTR_H */
130 1 118 182 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "elevator.h" #include "blk-mq.h" #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) __blk_mq_sched_restart(hctx); } static inline bool bio_mergeable(struct bio *bio) { return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request) e->type->ops.completed_request(rq, now); } } static inline void blk_mq_sched_requeue_request(struct request *rq) { if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } #endif
876 876 876 877 876 6553 6027 876 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 // SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)){ return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as contrainsted */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID. */ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. */ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .init = safesetid_security_init, .name = "safesetid", };
19 21 21 8 7 14 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 // SPDX-License-Identifier: GPL-2.0-or-later /* * Shared crypto simd helpers * * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * Copyright (c) 2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2019 Google LLC * * Based on aesni-intel_glue.c by: * Copyright (C) 2008, Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ /* * Shared crypto SIMD helpers. These functions dynamically create and register * an skcipher or AEAD algorithm that wraps another, internal algorithm. The * wrapper ensures that the internal algorithm is only executed in a context * where SIMD instructions are usable, i.e. where may_use_simd() returns true. * If SIMD is already usable, the wrapper directly calls the internal algorithm. * Otherwise it defers execution to a workqueue via cryptd. * * This is an alternative to the internal algorithm implementing a fallback for * the !may_use_simd() case itself. * * Note that the wrapper algorithm is asynchronous, i.e. it has the * CRYPTO_ALG_ASYNC flag set. Therefore it won't be found by users who * explicitly allocate a synchronous algorithm. */ #include <crypto/cryptd.h> #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/preempt.h> #include <asm/simd.h> /* skcipher support */ struct simd_skcipher_alg { const char *ialg_name; struct skcipher_alg alg; }; struct simd_skcipher_ctx { struct cryptd_skcipher *cryptd_tfm; }; static int simd_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int key_len) { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *child = &ctx->cryptd_tfm->base; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_skcipher_setkey(child, key, key_len); } static int simd_skcipher_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_request *subreq; struct crypto_skcipher *child; subreq = skcipher_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_skcipher_child(ctx->cryptd_tfm); skcipher_request_set_tfm(subreq, child); return crypto_skcipher_encrypt(subreq); } static int simd_skcipher_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_request *subreq; struct crypto_skcipher *child; subreq = skcipher_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_skcipher_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_skcipher_child(ctx->cryptd_tfm); skcipher_request_set_tfm(subreq, child); return crypto_skcipher_decrypt(subreq); } static void simd_skcipher_exit(struct crypto_skcipher *tfm) { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); cryptd_free_skcipher(ctx->cryptd_tfm); } static int simd_skcipher_init(struct crypto_skcipher *tfm) { struct simd_skcipher_ctx *ctx = crypto_skcipher_ctx(tfm); struct cryptd_skcipher *cryptd_tfm; struct simd_skcipher_alg *salg; struct skcipher_alg *alg; unsigned reqsize; alg = crypto_skcipher_alg(tfm); salg = container_of(alg, struct simd_skcipher_alg, alg); cryptd_tfm = cryptd_alloc_skcipher(salg->ialg_name, CRYPTO_ALG_INTERNAL, CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; reqsize = crypto_skcipher_reqsize(cryptd_skcipher_child(cryptd_tfm)); reqsize = max(reqsize, crypto_skcipher_reqsize(&cryptd_tfm->base)); reqsize += sizeof(struct skcipher_request); crypto_skcipher_set_reqsize(tfm, reqsize); return 0; } struct simd_skcipher_alg *simd_skcipher_create_compat(struct skcipher_alg *ialg, const char *algname, const char *drvname, const char *basename) { struct simd_skcipher_alg *salg; struct skcipher_alg *alg; int err; salg = kzalloc(sizeof(*salg), GFP_KERNEL); if (!salg) { salg = ERR_PTR(-ENOMEM); goto out; } salg->ialg_name = basename; alg = &salg->alg; err = -ENAMETOOLONG; if (snprintf(alg->base.cra_name, CRYPTO_MAX_ALG_NAME, "%s", algname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; if (snprintf(alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", drvname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; alg->base.cra_flags = CRYPTO_ALG_ASYNC | (ialg->base.cra_flags & CRYPTO_ALG_INHERITED_FLAGS); alg->base.cra_priority = ialg->base.cra_priority; alg->base.cra_blocksize = ialg->base.cra_blocksize; alg->base.cra_alignmask = ialg->base.cra_alignmask; alg->base.cra_module = ialg->base.cra_module; alg->base.cra_ctxsize = sizeof(struct simd_skcipher_ctx); alg->ivsize = ialg->ivsize; alg->chunksize = ialg->chunksize; alg->min_keysize = ialg->min_keysize; alg->max_keysize = ialg->max_keysize; alg->init = simd_skcipher_init; alg->exit = simd_skcipher_exit; alg->setkey = simd_skcipher_setkey; alg->encrypt = simd_skcipher_encrypt; alg->decrypt = simd_skcipher_decrypt; err = crypto_register_skcipher(alg); if (err) goto out_free_salg; out: return salg; out_free_salg: kfree(salg); salg = ERR_PTR(err); goto out; } EXPORT_SYMBOL_GPL(simd_skcipher_create_compat); void simd_skcipher_free(struct simd_skcipher_alg *salg) { crypto_unregister_skcipher(&salg->alg); kfree(salg); } EXPORT_SYMBOL_GPL(simd_skcipher_free); int simd_register_skciphers_compat(struct skcipher_alg *algs, int count, struct simd_skcipher_alg **simd_algs) { int err; int i; const char *algname; const char *drvname; const char *basename; struct simd_skcipher_alg *simd; err = crypto_register_skciphers(algs, count); if (err) return err; for (i = 0; i < count; i++) { WARN_ON(strncmp(algs[i].base.cra_name, "__", 2)); WARN_ON(strncmp(algs[i].base.cra_driver_name, "__", 2)); algname = algs[i].base.cra_name + 2; drvname = algs[i].base.cra_driver_name + 2; basename = algs[i].base.cra_driver_name; simd = simd_skcipher_create_compat(algs + i, algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) goto err_unregister; simd_algs[i] = simd; } return 0; err_unregister: simd_unregister_skciphers(algs, count, simd_algs); return err; } EXPORT_SYMBOL_GPL(simd_register_skciphers_compat); void simd_unregister_skciphers(struct skcipher_alg *algs, int count, struct simd_skcipher_alg **simd_algs) { int i; crypto_unregister_skciphers(algs, count); for (i = 0; i < count; i++) { if (simd_algs[i]) { simd_skcipher_free(simd_algs[i]); simd_algs[i] = NULL; } } } EXPORT_SYMBOL_GPL(simd_unregister_skciphers); /* AEAD support */ struct simd_aead_alg { const char *ialg_name; struct aead_alg alg; }; struct simd_aead_ctx { struct cryptd_aead *cryptd_tfm; }; static int simd_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int key_len) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *child = &ctx->cryptd_tfm->base; crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(child, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_aead_setkey(child, key, key_len); } static int simd_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_aead *child = &ctx->cryptd_tfm->base; return crypto_aead_setauthsize(child, authsize); } static int simd_aead_encrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct aead_request *subreq; struct crypto_aead *child; subreq = aead_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_aead_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_aead_child(ctx->cryptd_tfm); aead_request_set_tfm(subreq, child); return crypto_aead_encrypt(subreq); } static int simd_aead_decrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct aead_request *subreq; struct crypto_aead *child; subreq = aead_request_ctx(req); *subreq = *req; if (!crypto_simd_usable() || (in_atomic() && cryptd_aead_queued(ctx->cryptd_tfm))) child = &ctx->cryptd_tfm->base; else child = cryptd_aead_child(ctx->cryptd_tfm); aead_request_set_tfm(subreq, child); return crypto_aead_decrypt(subreq); } static void simd_aead_exit(struct crypto_aead *tfm) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); cryptd_free_aead(ctx->cryptd_tfm); } static int simd_aead_init(struct crypto_aead *tfm) { struct simd_aead_ctx *ctx = crypto_aead_ctx(tfm); struct cryptd_aead *cryptd_tfm; struct simd_aead_alg *salg; struct aead_alg *alg; unsigned reqsize; alg = crypto_aead_alg(tfm); salg = container_of(alg, struct simd_aead_alg, alg); cryptd_tfm = cryptd_alloc_aead(salg->ialg_name, CRYPTO_ALG_INTERNAL, CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; reqsize = crypto_aead_reqsize(cryptd_aead_child(cryptd_tfm)); reqsize = max(reqsize, crypto_aead_reqsize(&cryptd_tfm->base)); reqsize += sizeof(struct aead_request); crypto_aead_set_reqsize(tfm, reqsize); return 0; } static struct simd_aead_alg *simd_aead_create_compat(struct aead_alg *ialg, const char *algname, const char *drvname, const char *basename) { struct simd_aead_alg *salg; struct aead_alg *alg; int err; salg = kzalloc(sizeof(*salg), GFP_KERNEL); if (!salg) { salg = ERR_PTR(-ENOMEM); goto out; } salg->ialg_name = basename; alg = &salg->alg; err = -ENAMETOOLONG; if (snprintf(alg->base.cra_name, CRYPTO_MAX_ALG_NAME, "%s", algname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; if (snprintf(alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", drvname) >= CRYPTO_MAX_ALG_NAME) goto out_free_salg; alg->base.cra_flags = CRYPTO_ALG_ASYNC | (ialg->base.cra_flags & CRYPTO_ALG_INHERITED_FLAGS); alg->base.cra_priority = ialg->base.cra_priority; alg->base.cra_blocksize = ialg->base.cra_blocksize; alg->base.cra_alignmask = ialg->base.cra_alignmask; alg->base.cra_module = ialg->base.cra_module; alg->base.cra_ctxsize = sizeof(struct simd_aead_ctx); alg->ivsize = ialg->ivsize; alg->maxauthsize = ialg->maxauthsize; alg->chunksize = ialg->chunksize; alg->init = simd_aead_init; alg->exit = simd_aead_exit; alg->setkey = simd_aead_setkey; alg->setauthsize = simd_aead_setauthsize; alg->encrypt = simd_aead_encrypt; alg->decrypt = simd_aead_decrypt; err = crypto_register_aead(alg); if (err) goto out_free_salg; out: return salg; out_free_salg: kfree(salg); salg = ERR_PTR(err); goto out; } static void simd_aead_free(struct simd_aead_alg *salg) { crypto_unregister_aead(&salg->alg); kfree(salg); } int simd_register_aeads_compat(struct aead_alg *algs, int count, struct simd_aead_alg **simd_algs) { int err; int i; const char *algname; const char *drvname; const char *basename; struct simd_aead_alg *simd; err = crypto_register_aeads(algs, count); if (err) return err; for (i = 0; i < count; i++) { WARN_ON(strncmp(algs[i].base.cra_name, "__", 2)); WARN_ON(strncmp(algs[i].base.cra_driver_name, "__", 2)); algname = algs[i].base.cra_name + 2; drvname = algs[i].base.cra_driver_name + 2; basename = algs[i].base.cra_driver_name; simd = simd_aead_create_compat(algs + i, algname, drvname, basename); err = PTR_ERR(simd); if (IS_ERR(simd)) goto err_unregister; simd_algs[i] = simd; } return 0; err_unregister: simd_unregister_aeads(algs, count, simd_algs); return err; } EXPORT_SYMBOL_GPL(simd_register_aeads_compat); void simd_unregister_aeads(struct aead_alg *algs, int count, struct simd_aead_alg **simd_algs) { int i; crypto_unregister_aeads(algs, count); for (i = 0; i < count; i++) { if (simd_algs[i]) { simd_aead_free(simd_algs[i]); simd_algs[i] = NULL; } } } EXPORT_SYMBOL_GPL(simd_unregister_aeads); MODULE_DESCRIPTION("Shared crypto SIMD helpers"); MODULE_LICENSE("GPL");
2 1 2 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 // SPDX-License-Identifier: GPL-2.0 /* * super.c * * Copyright (c) 1999 Al Smith * * Portions derived from work (c) 1995,1996 Christian Vogelgsang. */ #include <linux/init.h> #include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); static int efs_init_fs_context(struct fs_context *fc); static void efs_kill_sb(struct super_block *s) { struct efs_sb_info *sbi = SUPER_INFO(s); kill_block_super(s); kfree(sbi); } static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, {0x01, "SGI trkrepl"}, {0x02, "SGI secrepl"}, {0x03, "SGI raw"}, {0x04, "SGI bsd"}, {SGI_SYSV, "SGI sysv"}, {0x06, "SGI vol"}, {SGI_EFS, "SGI efs"}, {0x08, "SGI lv"}, {0x09, "SGI rlv"}, {0x0A, "SGI xfs"}, {0x0B, "SGI xfslog"}, {0x0C, "SGI xlv"}, {0x82, "Linux swap"}, {0x83, "Linux native"}, {0, NULL} }; /* * File system definition and registration. */ static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = efs_init_fs_context, }; MODULE_ALIAS_FS("efs"); static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void efs_free_inode(struct inode *inode) { kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(efs_inode_cachep); } static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .free_inode = efs_free_inode, .statfs = efs_statfs, }; static const struct export_operations efs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = efs_fh_to_dentry, .fh_to_parent = efs_fh_to_parent, .get_parent = efs_get_parent, }; static int __init init_efs_fs(void) { int err; pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); err = init_inodecache(); if (err) goto out1; err = register_filesystem(&efs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_efs_fs(void) { unregister_filesystem(&efs_fs_type); destroy_inodecache(); } module_init(init_efs_fs) module_exit(exit_efs_fs) static efs_block_t efs_validate_vh(struct volume_header *vh) { int i; __be32 cs, *ui; int csum; efs_block_t sblock = 0; /* shuts up gcc */ struct pt_types *pt_entry; int pt_type, slice = -1; if (be32_to_cpu(vh->vh_magic) != VHMAGIC) { /* * assume that we're dealing with a partition and allow * read_super() to try and detect a valid superblock * on the next block. */ return 0; } ui = ((__be32 *) (vh + 1)) - 1; for(csum = 0; ui >= ((__be32 *) vh);) { cs = *ui--; csum += be32_to_cpu(cs); } if (csum) { pr_warn("SGI disklabel: checksum bad, label corrupted\n"); return 0; } #ifdef DEBUG pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); for(i = 0; i < NVDIR; i++) { int j; char name[VDNAMESIZE+1]; for(j = 0; j < VDNAMESIZE; j++) { name[j] = vh->vh_vd[i].vd_name[j]; } name[j] = (char) 0; if (name[0]) { pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); } } #endif for(i = 0; i < NPARTAB; i++) { pt_type = (int) be32_to_cpu(vh->vh_pt[i].pt_type); for(pt_entry = sgi_pt_types; pt_entry->pt_name; pt_entry++) { if (pt_type == pt_entry->pt_type) break; } #ifdef DEBUG if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), pt_type, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); } #endif if (IS_EFS(pt_type)) { sblock = be32_to_cpu(vh->vh_pt[i].pt_firstlbn); slice = i; } } if (slice == -1) { pr_notice("partition table contained no EFS partitions\n"); #ifdef DEBUG } else { pr_info("using slice %d (type %s, offset 0x%x)\n", slice, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", sblock); #endif } return sblock; } static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic))) return -1; sb->fs_magic = be32_to_cpu(super->fs_magic); sb->total_blocks = be32_to_cpu(super->fs_size); sb->first_block = be32_to_cpu(super->fs_firstcg); sb->group_size = be32_to_cpu(super->fs_cgfsize); sb->data_free = be32_to_cpu(super->fs_tfree); sb->inode_free = be32_to_cpu(super->fs_tinode); sb->inode_blocks = be16_to_cpu(super->fs_cgisize); sb->total_groups = be16_to_cpu(super->fs_ncg); return 0; } static int efs_fill_super(struct super_block *s, struct fs_context *fc) { struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; s->s_time_min = 0; s->s_time_max = U32_MAX; s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return invalf(fc, "device does not support %d byte blocks\n", EFS_BLOCKSIZE); } /* read the vh (volume header) block */ bh = sb_bread(s, 0); if (!bh) { pr_err("cannot read volume header\n"); return -EIO; } /* * if this returns zero then we didn't find any partition table. * this isn't (yet) an error - just assume for the moment that * the device is valid and go on to search for a superblock. */ sb->fs_start = efs_validate_vh((struct volume_header *) bh->b_data); brelse(bh); if (sb->fs_start == -1) { return -EINVAL; } bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { pr_err("cannot read superblock\n"); return -EIO; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG pr_warn("invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); #endif brelse(bh); return -EINVAL; } brelse(bh); if (!sb_rdonly(s)) { #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif s->s_flags |= SB_RDONLY; } s->s_op = &efs_superblock_operations; s->s_export_op = &efs_export_ops; root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { pr_err("get root inode failed\n"); return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { pr_err("get root dentry failed\n"); return -ENOMEM; } return 0; } static int efs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, efs_fill_super); } static int efs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; return 0; } static const struct fs_context_operations efs_context_opts = { .get_tree = efs_get_tree, .reconfigure = efs_reconfigure, }; /* * Set up the filesystem mount context. */ static int efs_init_fs_context(struct fs_context *fc) { fc->ops = &efs_context_opts; return 0; } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct efs_sb_info *sbi = SUPER_INFO(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ buf->f_blocks = sbi->total_groups * /* total data blocks */ (sbi->group_size - sbi->inode_blocks); buf->f_bfree = sbi->data_free; /* free data blocks */ buf->f_bavail = sbi->data_free; /* free blocks for non-root */ buf->f_files = sbi->total_groups * /* total inodes */ sbi->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sbi->inode_free; /* free inodes */ buf->f_fsid = u64_to_fsid(id); buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; }
1637 1008 1045 183 197 183 76 77 77 1 425 889 548 1044 1008 138 1367 221 1430 1429 1176 1177 1181 1180 1442 789 6075 303 304 303 84 1 371 389 146 146 21 6173 6166 6172 372 6175 1639 1042 1635 5709 603 416 32 33 6 5 1 1 1 4887 22 56 4858 6 77 1174 1174 13 1176 438 3 436 438 439 65 65 32 1246 1247 1248 592 1053 63 1035 643 6368 6366 2175 2175 1037 13 2181 5080 5084 5092 1185 1181 99 102 74 23 53 85 41 2 16 18 97 7 78 60 84 85 81 18 85 85 84 150 53 3 94 97 97 47 1381 1378 389 389 388 389 13 13 13 13 13 123 123 14 14 13 4 4 4 381 87 84 383 382 381 381 381 341 8 85 35 346 380 374 374 374 2 372 64 64 77 77 77 1 77 197 197 196 197 196 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk> */ #include <linux/mm.h> #include <linux/swap.h> #include <linux/bio-integrity.h> #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/iocontext.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { struct bio *free_list; struct bio *free_list_irq; unsigned int nr; unsigned int nr_irq; }; static struct biovec_slab { int nr_vecs; char *name; struct kmem_cache *slab; } bvec_slabs[] __read_mostly = { { .nr_vecs = 16, .name = "biovec-16" }, { .nr_vecs = 64, .name = "biovec-64" }, { .nr_vecs = 128, .name = "biovec-128" }, { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, }; static struct biovec_slab *biovec_slab(unsigned short nr_vecs) { switch (nr_vecs) { /* smaller bios use inline vecs */ case 5 ... 16: return &bvec_slabs[0]; case 17 ... 64: return &bvec_slabs[1]; case 65 ... 128: return &bvec_slabs[2]; case 129 ... BIO_MAX_VECS: return &bvec_slabs[3]; default: BUG(); return NULL; } } /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ struct bio_set fs_bio_set; EXPORT_SYMBOL(fs_bio_set); /* * Our slab pool management */ struct bio_slab { struct kmem_cache *slab; unsigned int slab_ref; unsigned int slab_size; char name[12]; }; static DEFINE_MUTEX(bio_slab_lock); static DEFINE_XARRAY(bio_slabs); static struct bio_slab *create_bio_slab(unsigned int size) { struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); if (!bslab) return NULL; snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; bslab->slab_ref = 1; bslab->slab_size = size; if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) return bslab; kmem_cache_destroy(bslab->slab); fail_alloc_slab: kfree(bslab); return NULL; } static inline unsigned int bs_bio_slab_size(struct bio_set *bs) { return bs->front_pad + sizeof(struct bio) + bs->back_pad; } static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); struct bio_slab *bslab; mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, size); if (bslab) bslab->slab_ref++; else bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); if (bslab) return bslab->slab; return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); bslab = xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; WARN_ON_ONCE(bslab->slab != bs->bio_slab); WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; xa_erase(&bio_slabs, slab_size); kmem_cache_destroy(bslab->slab); kfree(bslab); out: mutex_unlock(&bio_slab_lock); } void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); else if (nr_vecs > BIO_INLINE_VECS) kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); } /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask) { struct biovec_slab *bvs = biovec_slab(*nr_vecs); if (WARN_ON_ONCE(!bvs)) return NULL; /* * Upgrade the nr_vecs request to take full advantage of the allocation. * We also rely on this in the bvec_free path. */ *nr_vecs = bvs->nr_vecs; /* * Try a slab allocation first for all smaller allocations. If that * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. * The mempool is sized to handle up to BIO_MAX_VECS entries. */ if (*nr_vecs < BIO_MAX_VECS) { struct bio_vec *bvl; bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) return bvl; *nr_vecs = BIO_MAX_VECS; } return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio_integrity(bio)) bio_integrity_free(bio); bio_crypt_free_ctx(bio); } EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; void *p = bio; WARN_ON_ONCE(!bs); bio_uninit(bio); bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); mempool_free(p - bs->front_pad, &bs->bio_pool); } /* * Users of this function have their own bio allocation. Subsequently, * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf) { bio->bi_next = NULL; bio->bi_bdev = bdev; bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; bio->bi_write_hint = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = 0; bio->bi_end_io = NULL; bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; bio->bi_issue.value = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST bio->bi_iocost_cost = 0; #endif #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION bio->bi_crypt_context = NULL; #endif #ifdef CONFIG_BLK_DEV_INTEGRITY bio->bi_integrity = NULL; #endif bio->bi_vcnt = 0; atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset * @bdev: block device to use the bio for * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly * allocated bio returned bio bio_alloc_bioset() - the only fields that are * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; } static void bio_chain_endio(struct bio *bio) { bio_endio(__bio_chain_endio(bio)); } /** * bio_chain - chain bio completions * @bio: the target bio * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have * completed; the chained bio will also be freed when it completes. * * The caller must not set bi_private or bi_end_io in @bio. */ void bio_chain(struct bio *bio, struct bio *parent) { BUG_ON(bio->bi_private || bio->bi_end_io); bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); /** * bio_chain_and_submit - submit a bio after chaining it to another one * @prev: bio to chain and submit * @new: bio to chain to * * If @prev is non-NULL, chain it to @new and submit it. * * Return: @new. */ struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { if (prev) { bio_chain(prev, new); submit_bio(prev); } return new; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) { return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); } EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); struct bio *bio; while (1) { spin_lock(&bs->rescue_lock); bio = bio_list_pop(&bs->rescue_list); spin_unlock(&bs->rescue_lock); if (!bio) break; submit_bio_noacct(bio); } } static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; if (WARN_ON_ONCE(!bs->rescue_workqueue)) return; /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on * there for a stacking driver higher up in the stack, processing it * could require allocating bios from this bio_set, and doing that from * our own rescuer would be bad. * * Since bio lists are singly linked, pop them all instead of trying to * remove from the middle of the list: */ bio_list_init(&punt); bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[0]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[0] = nopunt; bio_list_init(&nopunt); while ((bio = bio_list_pop(&current->bio_list[1]))) bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); current->bio_list[1] = nopunt; spin_lock(&bs->rescue_lock); bio_list_merge(&bs->rescue_list, &punt); spin_unlock(&bs->rescue_lock); queue_work(bs->rescue_workqueue, &bs->rescue_work); } static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) { unsigned long flags; /* cache->free_list must be empty */ if (WARN_ON_ONCE(cache->free_list)) return; local_irq_save(flags); cache->free_list = cache->free_list_irq; cache->free_list_irq = NULL; cache->nr += cache->nr_irq; cache->nr_irq = 0; local_irq_restore(flags); } static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) bio_alloc_irq_cache_splice(cache); if (!cache->free_list) { put_cpu(); return NULL; } } bio = cache->free_list; cache->free_list = bio->bi_next; cache->nr--; put_cpu(); bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } /** * bio_alloc_bioset - allocate a bio for I/O * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. * * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to * allocate a bio. This is due to the mempool guarantees. To make this work, * callers must never allocate more than 1 bio at a time from the general pool. * Callers that need to allocate more than 1 bio must always submit the * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * * Note that when running under submit_bio_noacct() (i.e. any block driver), * bios are not submitted until after you return - see the code in * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under submit_bio_noacct() * would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; struct bio *bio; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; if (opf & REQ_ALLOC_CACHE) { if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, gfp_mask, bs); if (bio) return bio; /* * No cached bio available, bio returned below marked with * REQ_ALLOC_CACHE to particpate in per-cpu alloc cache. */ } else { opf &= ~REQ_ALLOC_CACHE; } } /* * submit_bio_noacct() converts recursion to iteration; this means if * we're running beneath it, any bios we allocate and submit will not be * submitted (and thus freed) until after we return. * * This exposes us to a potential deadlock if we allocate multiple bios * from the same bio_set() while running underneath submit_bio_noacct(). * If we were to allocate multiple bios (say a stacking block driver * that was splitting bios), we would deadlock if we exhausted the * mempool's reserve. * * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are bios on * current->bio_list, we first try the allocation without * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be * blocking to the rescuer workqueue before we retry with the original * gfp_flags. */ if (current->bio_list && (!bio_list_empty(&current->bio_list[0]) || !bio_list_empty(&current->bio_list[1])) && bs->rescue_workqueue) gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(&bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; p = mempool_alloc(&bs->bio_pool, gfp_mask); } if (unlikely(!p)) return NULL; if (!mempool_is_saturated(&bs->bio_pool)) opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { struct bio_vec *bvl = NULL; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } if (unlikely(!bvl)) goto err_free; bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; return bio; err_free: mempool_free(p, &bs->bio_pool); return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); /** * bio_kmalloc - kmalloc a bio * @nr_vecs: number of bio_vecs to allocate * @gfp_mask: the GFP_* mask given to the slab allocator * * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized * using bio_init() before use. To free a bio returned from this function use * kfree() after calling bio_uninit(). A bio returned from this function can * be reused by calling bio_uninit() before calling bio_init() again. * * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this * function are not backed by a mempool can fail. Do not use this function * for allocations in the file system I/O path. * * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (nr_vecs > UIO_MAXIOV) return NULL; return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size * @bio: the bio to be truncated * @new_size: new size for truncating the bio * * Description: * Truncate the bio to new size of @new_size. If bio_op(bio) is * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. */ static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; unsigned int done = 0; bool truncated = false; if (new_size >= bio->bi_iter.bi_size) return; if (bio_op(bio) != REQ_OP_READ) goto exit; bio_for_each_segment(bv, bio, iter) { if (done + bv.bv_len > new_size) { unsigned offset; if (!truncated) offset = new_size - done; else offset = 0; zero_user(bv.bv_page, bv.bv_offset + offset, bv.bv_len - offset); truncated = true; } done += bv.bv_len; } exit: /* * Don't touch bvec table here and make it really immutable, since * fs bio user has to retrieve all pages via bio_for_each_segment_all * in its .end_bio() callback. * * It is enough to truncate bio by updating .bi_size since we can make * correct bvec with the updated .bi_size for drivers. */ bio->bi_iter.bi_size = new_size; } /** * guard_bio_eod - truncate a BIO to fit the block device * @bio: bio to truncate * * This allows us to do IO even on the odd last sectors of a device, even if the * block size is some multiple of the physical sector size. * * We'll just truncate the bio to the size of the device, and clear the end of * the buffer head manually. Truly out-of-range accesses will turn into actual * I/O errors, this only handles the "we need to be able to do I/O at the final * sector" case. */ void guard_bio_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; /* * If the *whole* IO is past the end of the device, * let it through, and the IO layer will turn it into * an EIO. */ if (unlikely(bio->bi_iter.bi_sector >= maxsector)) return; maxsector -= bio->bi_iter.bi_sector; if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; bio_truncate(bio, maxsector << 9); } static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { unsigned int i = 0; struct bio *bio; while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) break; } return i; } static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int nr) { nr -= __bio_alloc_cache_prune(cache, nr); if (!READ_ONCE(cache->free_list)) { bio_alloc_irq_cache_splice(cache); __bio_alloc_cache_prune(cache, nr); } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) { struct bio_set *bs; bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); if (bs->cache) { struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } return 0; } static void bio_alloc_cache_destroy(struct bio_set *bs) { int cpu; if (!bs->cache) return; cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); for_each_possible_cpu(cpu) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bs->cache, cpu); bio_alloc_cache_prune(cache, -1U); } free_percpu(bs->cache); bs->cache = NULL; } static inline void bio_put_percpu_cache(struct bio *bio) { struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) goto out_free; if (in_task()) { bio_uninit(bio); bio->bi_next = cache->free_list; /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; } else { goto out_free; } put_cpu(); return; out_free: put_cpu(); bio_free(bio); } /** * bio_put - release a reference to a bio * @bio: bio to release reference to * * Description: * Put a reference to a &struct bio, either one you have gotten with * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. **/ void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } if (bio->bi_opf & REQ_ALLOC_CACHE) bio_put_percpu_cache(bio); else bio_free(bio); } EXPORT_SYMBOL(bio_put); static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { if (bio->bi_bdev == bio_src->bi_bdev && bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio_clone_blkg_association(bio, bio_src); } if (bio_crypt_clone(bio, bio_src, gfp) < 0) return -ENOMEM; if (bio_integrity(bio_src) && bio_integrity_clone(bio, bio_src, gfp) < 0) return -ENOMEM; return 0; } /** * bio_alloc_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio_src: bio to clone from * @gfp: allocation priority * @bs: bio_set to allocate from * * Allocate a new bio that is a clone of @bio_src. The caller owns the returned * bio, but not the actual data it points to. * * The caller must ensure that the return bio is not freed before @bio_src. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) { struct bio *bio; bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); if (!bio) return NULL; if (__bio_clone(bio, bio_src, gfp) < 0) { bio_put(bio); return NULL; } bio->bi_io_vec = bio_src->bi_io_vec; return bio; } EXPORT_SYMBOL(bio_alloc_clone); /** * bio_init_clone - clone a bio that shares the original bio's biovec * @bdev: block_device to clone onto * @bio: bio to clone into * @bio_src: bio to clone from * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. * The caller owns the returned bio, but not the actual data it points to. * * The caller must ensure that @bio_src is not freed before @bio. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) { int ret; bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); ret = __bio_clone(bio, bio_src, gfp); if (ret) bio_uninit(bio); return ret; } EXPORT_SYMBOL(bio_init_clone); /** * bio_full - check if the bio is full * @bio: bio to check * @len: length of one segment to be added * * Return true if @bio is full and one segment with @len bytes can't be * added to the bio, otherwise return false */ static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; if (bio->bi_iter.bi_size > UINT_MAX - len) return true; return false; } static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off, bool *same_page) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) return false; *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) & PAGE_MASK)); if (!*same_page) { if (IS_ENABLED(CONFIG_KMSAN)) return false; if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) return false; } bv->bv_len += len; return true; } /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. * * This is kept around for the integrity metadata, which is still tries * to build the initial bio to the hardware limit and doesn't have proper * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, bool *same_page) { unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = bvec_phys(bv); phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset, same_page); } /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio * @page: start page to add * @len: length of the data to add, may cross pages * @off: offset of the data relative to @page, may cross pages * * Add the data at @page + @off to @bio as a new bvec. The caller must ensure * that @bio has space for another bvec. */ void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } EXPORT_SYMBOL_GPL(__bio_add_page); /** * bio_add_page - attempt to add page(s) to bio * @bio: destination bio * @page: start page to add * @len: vec entry length, may cross pages * @offset: vec entry offset relative to @page, may cross pages * * Attempt to add page(s) to the bio_vec maplist. This will only fail * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { bool same_page = false; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; if (bio->bi_iter.bi_size > UINT_MAX - len) return 0; if (bio->bi_vcnt > 0 && bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], page, len, offset, &same_page)) { bio->bi_iter.bi_size += len; return len; } if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; WARN_ON_ONCE(len > UINT_MAX); __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. * @folio: Folio to add. * @len: How many bytes from the folio to add. * @off: First byte in this folio to add. * * Filesystems that use folios can call this function instead of calling * bio_add_page() for each page in the folio. If @off is bigger than * PAGE_SIZE, this function can create a bio_vec that starts in a page * after the bv_page. BIOs do not support folios that are 4GiB or larger. * * Return: Whether the addition was successful. */ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off) { unsigned long nr = off / PAGE_SIZE; if (len > UINT_MAX) return false; return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } EXPORT_SYMBOL(bio_add_folio); void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - fi.offset / PAGE_SIZE + 1; unpin_user_folio(fi.folio, nr_pages); } } EXPORT_SYMBOL_GPL(__bio_release_pages); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); } static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t offset) { bool same_page = false; if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) return -EIO; if (bio->bi_vcnt > 0 && bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], folio_page(folio, 0), len, offset, &same_page)) { bio->bi_iter.bi_size += len; if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) unpin_user_folio(folio, 1); return 0; } bio_add_folio_nofail(bio, folio, len, offset); return 0; } static unsigned int get_contig_folio_len(unsigned int *num_pages, struct page **pages, unsigned int i, struct folio *folio, size_t left, size_t offset) { size_t bytes = left; size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); unsigned int j; /* * We might COW a single page in the middle of * a large folio, so we have to check that all * pages belong to the same folio. */ bytes -= contig_sz; for (j = i + 1; j < i + *num_pages; j++) { size_t next = min_t(size_t, PAGE_SIZE, bytes); if (page_folio(pages[j]) != folio || pages[j] != pages[j - 1] + 1) { break; } contig_sz += next; bytes -= next; } *num_pages = j - i; return contig_sz; } #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * * Extracts pages from *iter and appends them to @bio's bvec array. The pages * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; size_t offset, folio_offset, left, len; int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) extraction_flags |= ITER_ALLOW_P2PDMA; /* * Each segment in the iov is required to be a block size multiple. * However, we may not be able to get the entire segment if it spans * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. */ size = iov_iter_extract_pages(iter, &pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); if (bio->bi_bdev) { size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); iov_iter_revert(iter, trim); size -= trim; } if (unlikely(!size)) { ret = -EFAULT; goto out; } for (left = size, i = 0; left > 0; left -= len, i += num_pages) { struct page *page = pages[i]; struct folio *folio = page_folio(page); folio_offset = ((size_t)folio_page_idx(folio, page) << PAGE_SHIFT) + offset; len = min(folio_size(folio) - folio_offset, left); num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); if (num_pages > 1) len = get_contig_folio_len(&num_pages, pages, i, folio, left, offset); bio_iov_add_folio(bio, folio, len, folio_offset); offset = 0; } iov_iter_revert(iter, left); out: while (i < nr_pages) bio_release_page(bio, pages[i++]); return ret; } /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs * to ensure the bvecs and pages stay referenced until the submitted I/O is * completed by a call to ->ki_complete() or returns with an error other than * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF * on IO completion. If it isn't, then pages should be released. * * The function tries, but does not guarantee, to pin as many pages as * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { int ret = 0; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); static void submit_bio_wait_endio(struct bio *bio) { complete(bio->bi_private); } /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O * * Simple wrapper around submit_bio(). Returns 0 on success, or the error from * bio_endio() on failure. * * WARNING: Unlike to how submit_bio() is usually used, this function does not * result in bio reference to be consumed. The caller must drop the reference * on his own. */ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); bio_put(bio); } /* * bio_await_chain - ends @bio and waits for every chained bio to complete */ void bio_await_chain(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; bio->bi_end_io = bio_wait_end_io; bio_endio(bio); blk_wait_io(&done); } void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { while (src_iter->bi_size && dst_iter->bi_size) { struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); void *src_buf = bvec_kmap_local(&src_bv); void *dst_buf = bvec_kmap_local(&dst_bv); memcpy(dst_buf, src_buf, bytes); kunmap_local(dst_buf); kunmap_local(src_buf); bio_advance_iter_single(src, src_iter, bytes); bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio * @dst: destination bio * * Stops when it reaches the end of either @src or @dst - that is, copies * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). */ void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } EXPORT_SYMBOL(bio_copy_data); void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) __free_page(bvec->bv_page); } EXPORT_SYMBOL(bio_free_pages); /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the * deferred bio dirtying paths. */ /* * bio_set_pages_dirty() will mark all the bio's pages as dirty. */ void bio_set_pages_dirty(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) { folio_lock(fi.folio); folio_mark_dirty(fi.folio); folio_unlock(fi.folio); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. * If they are, then fine. If, however, some pages are clean then they must * have been written out during the direct-IO read. So we take another ref on * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from * here on. It will unpin each page and will run one bio_put() against the * BIO. */ static void bio_dirty_fn(struct work_struct *work); static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); static DEFINE_SPINLOCK(bio_dirty_lock); static struct bio *bio_dirty_list; /* * This runs in process context */ static void bio_dirty_fn(struct work_struct *work) { struct bio *bio, *next; spin_lock_irq(&bio_dirty_lock); next = bio_dirty_list; bio_dirty_list = NULL; spin_unlock_irq(&bio_dirty_lock); while ((bio = next) != NULL) { next = bio->bi_private; bio_release_pages(bio, true); bio_put(bio); } } void bio_check_pages_dirty(struct bio *bio) { struct folio_iter fi; unsigned long flags; bio_for_each_folio_all(fi, bio) { if (!folio_test_dirty(fi.folio)) goto defer; } bio_release_pages(bio, false); bio_put(bio); return; defer: spin_lock_irqsave(&bio_dirty_lock, flags); bio->bi_private = bio_dirty_list; bio_dirty_list = bio; spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { /* * If we're not chaining, then ->__bi_remaining is always 1 and * we always end io on the first invocation. */ if (!bio_flagged(bio, BIO_CHAIN)) return true; BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); if (atomic_dec_and_test(&bio->__bi_remaining)) { bio_clear_flag(bio, BIO_CHAIN); return true; } return false; } /** * bio_endio - end I/O on a bio * @bio: bio * * Description: * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the * last time. **/ void bio_endio(struct bio *bio) { again: if (!bio_remaining_done(bio)) return; if (!bio_integrity_endio(bio)) return; blk_zone_bio_endio(bio); rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that * save/restore bi_end_io) - however, we want to avoid unbounded * recursion and blowing the stack. Tail call optimization would * handle this, but compiling with frame pointers also disables * gcc's sibling call optimization. */ if (bio->bi_end_io == bio_chain_endio) { bio = __bio_chain_endio(bio); goto again; } #ifdef CONFIG_BLK_CGROUP /* * Release cgroup info. We shouldn't have to do this here, but quite * a few callers of bio_init fail to call bio_uninit, so we cover up * for that here at least for now. */ if (bio->bi_blkg) { blkg_put(bio->bi_blkg); bio->bi_blkg = NULL; } #endif if (bio->bi_end_io) bio->bi_end_io(bio); } EXPORT_SYMBOL(bio_endio); /** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Allocates and returns a new bio which represents @sectors from the start of * @bio, and updates @bio to represent the remaining sectors. * * Unless this is a discard request the newly allocated bio will point * to @bio's bi_io_vec. It is the caller's responsibility to ensure that * neither @bio nor @bs are freed before the split bio. */ struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { struct bio *split; if (WARN_ON_ONCE(sectors <= 0)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(sectors >= bio_sectors(bio))) return ERR_PTR(-EINVAL); /* Zone append commands cannot be split */ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return ERR_PTR(-EINVAL); /* atomic writes cannot be split */ if (bio->bi_opf & REQ_ATOMIC) return ERR_PTR(-EINVAL); split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return ERR_PTR(-ENOMEM); split->bi_iter.bi_size = sectors << 9; if (bio_integrity(split)) bio_integrity_trim(split); bio_advance(bio, split->bi_iter.bi_size); if (bio_flagged(bio, BIO_TRACE_COMPLETION)) bio_set_flag(split, BIO_TRACE_COMPLETION); return split; } EXPORT_SYMBOL(bio_split); /** * bio_trim - trim a bio * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors * * This function is typically used for bios that are cloned and submitted * to the underlying device in parts. */ void bio_trim(struct bio *bio, sector_t offset, sector_t size) { /* We should never trim an atomic write */ if (WARN_ON_ONCE(bio->bi_opf & REQ_ATOMIC && size)) return; if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || offset + size > bio_sectors(bio))) return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) return; bio_advance(bio, offset << 9); bio->bi_iter.bi_size = size; if (bio_integrity(bio)) bio_integrity_trim(bio); } EXPORT_SYMBOL_GPL(bio_trim); /* * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ int biovec_init_pool(mempool_t *pool, int pool_entries) { struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } /* * bioset_exit - exit a bioset initialized with bioset_init() * * May be called on a zeroed but uninitialized bioset (i.e. allocated with * kzalloc()). */ void bioset_exit(struct bio_set *bs) { bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; mempool_exit(&bs->bio_pool); mempool_exit(&bs->bvec_pool); if (bs->bio_slab) bio_put_slab(bs); bs->bio_slab = NULL; } EXPORT_SYMBOL(bioset_exit); /** * bioset_init - Initialize a bio_set * @bs: pool to initialize * @pool_size: Number of bio and bio_vecs to cache in the mempool * @front_pad: Number of bytes to allocate in front of the returned bio * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS * and %BIOSET_NEED_RESCUER * * Description: * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller * to ask for a number of bytes to be allocated in front of the bio. * Front pad allocation is useful for embedding the bio inside * another structure, to avoid allocating extra data to go with the bio. * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used * to dispatch queued requests when the mempool runs out of space. * */ int bioset_init(struct bio_set *bs, unsigned int pool_size, unsigned int front_pad, int flags) { bs->front_pad = front_pad; if (flags & BIOSET_NEED_BVECS) bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); else bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) goto bad; if ((flags & BIOSET_NEED_BVECS) && biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; if (flags & BIOSET_NEED_RESCUER) { bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); if (!bs->rescue_workqueue) goto bad; } if (flags & BIOSET_PERCPU_CACHE) { bs->cache = alloc_percpu(struct bio_alloc_cache); if (!bs->cache) goto bad; cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); } return 0; bad: bioset_exit(bs); return -ENOMEM; } EXPORT_SYMBOL(bioset_init); static int __init init_bio(void) { int i; BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; bvs->slab = kmem_cache_create(bvs->name, bvs->nr_vecs * sizeof(struct bio_vec), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, bio_cpu_dead); if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); return 0; } subsys_initcall(init_bio);
879 711 711 268 708 704 702 435 704 703 418 781 203 276 734 5 732 560 559 1 293 485 3 488 534 504 504 505 688 38 735 491 328 683 433 308 1194 667 551 764 471 294 602 383 219 480 312 167 218 101 118 375 221 155 238 137 102 91 38 53 65 23 42 417 175 419 417 415 416 417 416 417 416 284 135 257 40 227 285 94 2 62 206 13 4 10 39 101 64 39 1 39 681 6 682 131 1 132 43 2 42 59 2 43 16 43 16 50 50 7 145 11 5 139 139 11 33 26 7 30 556 404 404 404 8 74 73 74 74 72 74 19 19 19 8 11 19 11 11 11 6 5 11 11 11 11 11 853 855 482 189 133 256 483 452 451 278 212 136 450 451 4 449 23 19 2 2 23 609 608 607 608 52 1 2 1 51 34 33 1 2 578 552 575 576 27 8 1 36 38 37 24 1 6 124 15 538 107 26 1 9 1 1 62 35 32 2 35 1 18 27 40 44 31 37 1 32 3 1 1 22 34 2 4 9 9 26 17 3 1 372 237 343 34 343 234 517 247 243 69 32 38 69 67 232 81 3 546 19 471 95 96 96 6 75 21 450 1 83 14 527 521 529 524 57 470 43 485 529 2 44 4 1 543 2 4 22 536 20 575 573 1 573 1 2 571 543 2 490 70 1 70 70 1 65 4 567 555 555 554 555 555 555 555 555 555 556 556 555 556 12 21 21 21 19 2 19 2 21 21 21 21 21 21 21 21 21 21 21 21 21 2 10 11 20 1 21 21 19 2 17 4 17 4 21 7 10 21 21 21 21 1 20 21 20 14 7 20 1 21 21 429 27 50 5 15 337 334 353 348 4 352 372 56 161 160 160 161 160 161 161 171 10 161 161 161 160 161 161 284 55 134 95 429 2 440 284 156 447 457 457 454 451 6 448 7 1 455 453 2 448 5 2 451 449 2 447 3 2 447 413 33 192 254 446 317 319 481 482 163 320 383 483 296 101 61 447 428 19 427 420 72 510 1 90 5 1 464 2 5 37 43 6 41 289 290 12 7 8 290 289 340 419 8 72 341 340 270 290 342 271 289 310 33 33 33 33 33 33 33 33 31 33 33 344 344 344 343 343 319 2 2 23 414 187 226 396 232 481 41 430 92 417 105 473 50 504 20 427 48 49 438 86 97 425 98 424 524 475 475 1 137 1 335 1 1 2 27 442 447 446 447 22 1 2 1 498 183 322 1 13 1 1 9 4 502 2 1 522 3 525 1 524 1 6 165 350 500 479 15 4 479 1 20 473 27 7 466 27 7 1 485 483 1 1 481 467 1 464 1 466 466 1 464 1 460 2 460 458 2 458 281 177 1 458 458 457 457 458 1 457 457 455 457 15 15 13 503 75 1 1 33 40 12 61 426 425 1 1 1 423 533 2 1 1 1 1 342 183 1 180 4 476 1 1 274 31 69 9 92 482 483 483 157 1 323 1 2 2 477 402 19 512 533 533 2 526 524 524 517 13 505 2 18 6 2 6 10 12 399 48 274 3 168 315 132 337 15 88 1 4 421 26 129 266 422 21 310 110 344 5 3 387 27 60 343 9 15 390 1 404 233 169 2 275 126 396 395 208 187 26 26 130 130 130 396 533 527 5 533 396 396 137 533 32 7 25 3 4 4 2 6 10 5 2 7 6 6 4 610 609 60 561 610 348 442 610 383 175 359 416 169 380 608 610 609 610 610 610 62 7 57 10 14 10 14 48 20 5 5 5 5 53 13 13 63 3 60 57 5 2 61 2 63 4 1 58 15 6 2 16 18 20 22 1 19 6 12 45 4 20 29 32 6 3 8 40 4 24 20 43 33 10 21 20 1 21 21 73 11 63 21 35 35 2 33 35 35 35 35 2 35 242 144 144 144 5 142 199 199 12 154 61 144 143 1 3 1 2 5 1 1 3 1 1 1 3 26 150 5 128 21 170 162 159 150 128 30 35 34 13 22 32 12 1 1 257 258 258 2 257 4 196 196 195 8 28 193 192 69 5 3 3 2 2 148 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> #include <linux/random.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" #include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { if (simu_fail) { clear_buffer_uptodate(bh); unlock_buffer(bh); return; } /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } return ext4_read_bh(bh, op_flags, NULL, false); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. */ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(sbi, ~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_feature_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last * superblock update * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the * last superblock update. * * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (ext4_emergency_state(sb) || sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || journal->j_flags & JBD2_UNMOUNT) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < sbi->s_sb_update_sec)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > sbi->s_sb_update_kb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); } /* * This writepage callback for write_cache_pages() * takes care of a few cases after page cleaning. * * write_cache_pages() already checks for dirty pages * and calls clear_page_dirty_for_io(), which we want, * to write protect the pages. * * However, we may have to redirty a page (see below.) */ static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. */ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); out: return AOP_WRITEPAGE_ACTIVATE; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; return write_cache_pages(mapping, &wbc, ext4_journalled_writepage_callback, jinode->i_transaction); } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !ext4_emergency_ro(sb) && journal) jbd2_journal_abort(journal, -EIO); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. We just need to be * careful when the journal is already shutting down. If we get * here in that case, just update the sb directly as the last * transaction won't commit anyway. */ if (continue_fs && journal && !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * We don't set SB_RDONLY because that requires sb->s_umount * semaphore and setting it without proper remount procedure is * confusing code such as freeze_super() leading to deadlocks * and other problems. */ set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!ext4_emergency_state(sbi->s_sb) && !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) { struct va_format vaf; va_list args; if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. We return 1 as a hint to callers * who might what to use the return value from * ext4_grp_locked_error() to distinguish between the * ERRORS_CONT and ERRORS_RO case, and perhaps return more * aggressively from the ext4 function in question, with a * more appropriate error code. */ ext4_lock_group(sb, grp); return; } void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t group, unsigned int flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; if (!grp || !gdp) return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); } if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } } } void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); /* leave es->s_feature_*compat flags alone */ /* es->s_uuid will be set by e2fsck if empty */ /* * The rest of the superblock fields should be zero, and if not it * means they are likely already in use, so leave them alone. We * can leave it up to e2fsck to clean up any inconsistencies there. */ } static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; ext4_msg(sb, KERN_ERR, "sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); static inline void ext4_quotas_off(struct super_block *sb, int type) { BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ for (type--; type >= 0; type--) ext4_quota_off(sb, type); } /* * This is a helper function which is used in the mount/remount * codepaths (which holds s_umount) to fetch the quota file name. */ static inline char *get_qf_name(struct super_block *sb, struct ext4_sb_info *sbi, int type) { return rcu_dereference_protected(sbi->s_qf_names[type], lockdep_is_held(&sb->s_umount)); } #else static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif static int ext4_percpu_param_init(struct ext4_sb_info *sbi) { ext4_fsblk_t block; int err; block = ext4_count_free_clusters(sbi->s_sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sbi->s_sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sbi->s_sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); return err; } static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); } static void ext4_group_desc_free(struct ext4_sb_info *sbi) { struct buffer_head **group_desc; int i; rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) { struct flex_groups **flex_groups; int i; rcu_read_lock(); flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; int err; /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } } else flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { if (!aborted) { ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } ext4_commit_super(sb); } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) && percpu_counter_sum(&sbi->s_dirtyclusters_counter)); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list * isn't empty. The on-disk one can be non-empty if we've * detected an error and taken the fs readonly, but the * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; ext4_stop_mmpd(sbi); brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %ld still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot __rcu **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. */ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_gid ("resgid", Opt_resgid), fsparam_uid ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. */ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline __maybe_unused \ void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline __maybe_unused \ void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return unnote_qf_name(fc, GRPQUOTA); else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: ctx->s_resuid = result.uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: ctx->s_resgid = result.gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; } static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, &param); kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; } static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *s_mount_opts = NULL; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), GFP_KERNEL); if (!s_mount_opts) return ret; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) goto out_free; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: if (fc) { ext4_fc_free(fc); kfree(fc); } kfree(s_mount_opts); return ret; } static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency. */ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (!sbi->s_journal && ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { ext4_msg(NULL, KERN_WARNING, "Remounting fs w/o journal so ignoring data_err option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); if (ext4_emergency_ro(sb)) SEQ_OPTS_PUTS("emergency_ro"); if (ext4_forced_shutdown(sb)) SEQ_OPTS_PUTS("shutdown"); ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_putc(seq, '\n'); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. We lower the maxbytes * by one fs block, so ee_len can cover the extent of maximum file * size */ res = (1LL << 32) - 1; res <<= blkbits; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) res = upper_limit; return res; } /* * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; unsigned int ppb = 1 << (bits - 2); /* * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field * represents total file blocks in 2^32 512-byte sectors == * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (bits - 9); } else { /* * We use 48 bit ext4_inode i_blocks * With EXT4_HUGE_FILE_FL set the i_blocks * represent total number of blocks in * file system block size */ upper_limit = (1LL << 48) - 1; } /* Compute how many blocks we can address by block tree */ res += ppb; res += ppb * ppb; res += ((loff_t)ppb) * ppb * ppb; /* Compute how many metadata blocks are needed */ meta_blocks = 1; meta_blocks += 1 + ppb; meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; /* How many metadata blocks are needed for addressing upper_limit? */ upper_limit -= EXT4_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; upper_limit -= ppb; /* double indirect blocks */ if (upper_limit < ppb * ppb) { meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); res -= meta_blocks; goto check_lfs; } meta_blocks += 1 + ppb; upper_limit -= ppb * ppb; /* tripple indirect blocks for the rest */ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); res -= meta_blocks; check_lfs: res <<= bits; if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; return res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; /* * If we have a meta_bg fs with 1k blocks, group 0's GDT is at * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled * on modern mke2fs or blksize > 1k on older mke2fs) then we must * compensate. */ if (sb->s_blocksize == 1024 && nr == 0 && le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); } /** * ext4_get_stripe_size: Get the stripe size. * @sbi: In memory super block info * * If we have specified it via mount option, then * use the mount option value. If the value specified at mount time is * greater than the blocks per group use the super block value. * If the super block value is greater than blocks per group return 0. * Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day if we have errors logged * on the file system */ static void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. */ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !ext4_emergency_state(sb) && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. * When such a fs is found, run the lazy initialization request * (ext4_rn_li_request) and keep track of the time spend in this * function. Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and put itself into sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { bool next_wakeup_initialized = false; next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it make sense to run itable init. thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. */ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc(sizeof(*eli), GFP_KERNEL); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. */ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, PAGE_SIZE); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_cluster_bits = 0; } sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. * @sb: super block * TODO: Later add support for bigalloc */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; if (!bdev_can_atomic_write(bdev)) return; if (!ext4_has_feature_extents(sb)) return; sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); sbi->s_awu_max = min(sb->s_blocksize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", sbi->s_awu_min, sbi->s_awu_max); } else { sbi->s_awu_min = 0; sbi->s_awu_max = 0; } } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. */ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_feature_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block *es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static const char *ext4_has_journal_option(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) return "journal_async_commit"; if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) return "journal_checksum"; if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) return "commit="; if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) return "data="; if (test_opt(sb, DATA_ERR_ABORT)) return "data_err=abort"; return NULL; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) { *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; } /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static int ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; sbi->s_def_hash_version = es->s_def_hash_version; if (sbi->s_def_hash_version > DX_HASH_LAST) { ext4_msg(sb, KERN_ERR, "Invalid default hash set in the superblock"); return -EINVAL; } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { ext4_msg(sb, KERN_ERR, "SIPHASH is not a valid default hash value"); return -EINVAL; } for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } return 0; } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } /* * It's hard to get stripe aligned blocks if stripe is not aligned with * cluster, just disable stripe and alert user to simplify code and avoid * stripe aligned allocation which will rarely succeed. */ static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (stripe > 0 && sbi->s_cluster_ratio > 1 && stripe % sbi->s_cluster_ratio != 0); } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified. */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; /* HSM events are allowed by default. */ sb->s_iflags |= SB_I_ALLOW_HSM; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; err = ext4_block_group_meta_init(sb, silent); if (err) goto failed_mount; err = ext4_hash_info_init(sb); if (err) goto failed_mount; err = ext4_handle_clustersize(sb); if (err) goto failed_mount; err = ext4_check_geometry(sb, es); if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; err = ext4_es_register_shrinker(sbi); if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", sbi->s_stripe, sbi->s_cluster_ratio); sbi->s_stripe = 0; } sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &ext4_verityops; #endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); spin_lock_init(&sbi->s_bdev_wb_lock); ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto failed_mount3a; } err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { const char *journal_option; /* Nojournal mode, all journal mount options are illegal */ journal_option = ext4_has_journal_option(sb); if (journal_option != NULL) { ext4_msg(sb, KERN_ERR, "can't mount with %s, fs mounted w/o journal", journal_option); goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; } if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); err = -EINVAL; goto failed_mount_wq; } if (ext4_has_feature_ea_inode(sb)) { sbi->s_ea_inode_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); err = -EINVAL; goto failed_mount_wq; } } } /* * Get the # of file system overhead blocks from the * superblock if present. */ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); /* ignore the precalculated value if it is ridiculous */ if (sbi->s_overhead > ext4_blocks_count(es)) sbi->s_overhead = 0; /* * If the bigalloc feature is not enabled recalculating the * overhead doesn't take long, so we might as well just redo * it to make sure we are using the correct value. */ if (!ext4_has_feature_bigalloc(sb)) sbi->s_overhead = 0; if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; } /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->rsv_conversion_wq = alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); err = -ENOMEM; goto failed_mount4; } /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. */ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); err = -EFSCORRUPTED; goto failed_mount4; } generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); err = -ENOMEM; goto failed_mount4; } err = ext4_setup_super(sb, es, sb_rdonly(sb)); if (err == -EROFS) { sb->s_flags |= SB_RDONLY; } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); if (test_opt(sb, BLOCK_VALIDITY)) { err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); goto failed_mount4a; } } ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); /* * Enable optimize_scan if number of groups is > threshold. This can be * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". */ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); else clear_opt2(sb, MB_OPTIMIZE_SCAN); } err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount5; } /* * We can only set up the journal commit callback once * mballoc is initialized */ if (sbi->s_journal) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; err = ext4_percpu_param_init(sbi); if (err) goto failed_mount6; if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); err = -ENOMEM; goto failed_mount6; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; err = ext4_init_orphan_info(sb); if (err) goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; } #endif /* CONFIG_QUOTA */ /* * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; /* * Update the checksum after updating free space/inode counters and * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect * checksum in the buffer cache until it is written out and * e2fsprogs programs trying to open a file system immediately * after it is mounted can fail. */ ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) goto failed_mount9; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); clear_opt(sb, DISCARD); } if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); /* Register sysfs after all initializations are complete. */ err = ext4_register_sysfs(sb); if (err) goto failed_mount9; return 0; failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount8: __maybe_unused ext4_release_orphan_info(sb); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); timer_delete_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi; const char *descr; int ret; sbi = ext4_alloc_sbi(sb); if (!sbi) return -ENOMEM; fc->s_fs_info = sbi; /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); sbi->s_sb_block = 1; /* Default super block location */ if (ctx->spec & EXT4_SPEC_s_sb_block) sbi->s_sb_block = ctx->s_sb_block; ret = __ext4_fill_super(fc, sb); if (ret < 0) goto free_sbi; if (sbi->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) descr = " journalled data mode"; else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) descr = " ordered data mode"; else descr = " writeback data mode"; } else descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " "Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); return 0; free_sbi: ext4_free_sbi(sbi); fc->s_fs_info = NULL; return ret; } static int ext4_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ext4_fill_super); } /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. */ journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; /* * Test for the existence of a valid inode on disk. Bad things * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); return journal_inode; } static int ext4_journal_bmap(journal_t *journal, sector_t *block) { struct ext4_map_blocks map; int ret; if (journal->j_inode == NULL) return 0; map.m_lblk = *block; map.m_len = 1; ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); if (ret <= 0) { ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); jbd2_journal_abort(journal, ret ? ret : -EIO); return ret; } *block = map.m_pblk; return 0; } static journal_t *ext4_open_inode_journal(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (IS_ERR(journal_inode)) return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); return bdev_file; } bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev_file, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); errno = -EINVAL; goto out_bdev; } es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); errno = -EFSCORRUPTED; goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); errno = -EFSCORRUPTED; goto out_bh; } *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); return bdev_file; out_bh: brelse(bh); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static journal_t *ext4_open_dev_journal(struct super_block *sb, dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct file *bdev_file; int errno = 0; bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); errno = -EINVAL; goto out_journal; } journal->j_private = sb; EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, struct ext4_super_block *es, unsigned long journal_devnum) { journal_t *journal; unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); dev_t journal_dev; int err = 0; int really_read_only; int journal_dev_ro; if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { ext4_msg(sb, KERN_INFO, "external journal device major/minor " "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); if (journal_inum && journal_dev) { ext4_msg(sb, KERN_ERR, "filesystem has both journal inode and journal device!"); return -EINVAL; } if (journal_inum) { journal = ext4_open_inode_journal(sb, journal_inum); if (IS_ERR(journal)) return PTR_ERR(journal); } else { journal = ext4_open_dev_journal(sb, journal_dev); if (IS_ERR(journal)) return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; if (journal_dev_ro && !sb_rdonly(sb)) { ext4_msg(sb, KERN_ERR, "journal device read-only, try mounting with '-o ro'"); err = -EROFS; goto err_out; } /* * Are we loading a blank journal or performing recovery after a * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); err = -EROFS; goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); __le16 orig_state; bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); if (save && memcmp(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); changed = true; } kfree(save); orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); if (orig_state != es->s_state) changed = true; /* Write out restored error information to the superblock */ if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? : err2; } } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); goto err_out; } EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { ext4_journal_destroy(EXT4_SB(sb), journal); return err; } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); ext4_commit_super(sb); } if (!really_read_only && journal_inum && journal_inum != le32_to_cpu(es->s_journal_inum)) { es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } return 0; err_out: ext4_journal_destroy(EXT4_SB(sb), journal); return err; } /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *sbh = sbi->s_sbh; lock_buffer(sbh); /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system * read/only but we need to replay the journal; at that point, * for people who are east of GMT and who make their clock * tick in localtime for Windows bug-for-bug compatibility, * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(sbi, percpu_counter_sum_positive( &sbi->s_freeclusters_counter))); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &sbi->s_freeinodes_counter)); /* Copy error information to the on-disk superblock */ spin_lock(&sbi->s_error_lock); if (sbi->s_add_error_count > 0) { es->s_state |= cpu_to_le16(EXT4_ERROR_FS); if (!es->s_first_error_time && !es->s_first_error_time_hi) { __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); strtomem_pad(es->s_first_error_func, sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = cpu_to_le32(sbi->s_first_error_ino); es->s_first_error_block = cpu_to_le64(sbi->s_first_error_block); es->s_first_error_errcode = ext4_errno_to_code(sbi->s_first_error_code); } __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); es->s_last_error_errcode = ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been * started already */ if (!es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } spin_unlock(&sbi->s_error_lock); ext4_superblock_csum_set(sb); unlock_buffer(sbh); } static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; if (!sbh) return -EINVAL; ext4_update_super(sb); lock_buffer(sbh); /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(sbh)) { unlock_buffer(sbh); return -EIO; } if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the * USB device was yanked out. Or it could happen to * be a transient write error and maybe the block will * be remapped. Nothing we can do but to retry the * write and hope for the best. */ ext4_msg(sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } get_bh(sbh); /* Clear potential dirty bit if it was journalled update */ clear_buffer_dirty(sbh); sbh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); return -EIO; } return 0; } /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es) { int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { if (journal != NULL) { ext4_error(sb, "Journal got removed while the fs was " "mounted!"); return -EFSCORRUPTED; } return 0; } jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || ext4_has_feature_orphan_present(sb))) { if (!ext4_orphan_file_empty(sb)) { ext4_error(sb, "Orphan file not empty on read-only fs."); err = -EFSCORRUPTED; goto out; } ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: jbd2_journal_unlock_updates(journal); return err; } /* * If we are mounting (or read-write remounting) a filesystem whose journal * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; if (!ext4_has_feature_journal(sb)) { ext4_error(sb, "Journal got removed while the fs was mounted!"); return -EFSCORRUPTED; } journal = EXT4_SB(sb)->s_journal; /* * Now check for any error status which may have been recorded in the * journal by a prior ext4_error() or ext4_abort() */ j_errno = jbd2_journal_errno(journal); if (j_errno) { char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); j_errno = ext4_commit_super(sb); if (j_errno) return j_errno; ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } return 0; } /* * Force the running and committing transactions to commit, * and wait on the commit. */ int ext4_force_commit(struct super_block *sb) { return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); ret = ext4_emergency_state(sb); if (unlikely(ret)) return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); /* * Data writeback is possible w/o journal transaction, so barrier must * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ if (sbi->s_journal) { target = jbd2_get_latest_transaction(sbi->s_journal); if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) needs_barrier = true; if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) ret = jbd2_log_wait_commit(sbi->s_journal, target); } } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } return ret; } /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean * state independently. It relies on upper layer to stop all data & metadata * modifications. */ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); /* * Don't clear the needs_recovery flag if we failed to * flush the journal. */ error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); out: if (journal) /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(journal); return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ static int ext4_unfreeze(struct super_block *sb) { if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); return 0; } /* * Structure to save mount options for ext4_remount's benefit */ struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; kuid_t s_resuid; kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; static int __ext4_remount(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; old_opts.s_min_batch_time = sbi->s_min_batch_time; old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { char *qf_name = get_qf_name(sb, sbi, i); old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); return -ENOMEM; } } else old_opts.s_qf_names[i] = NULL; #endif if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { if (sbi->s_journal && sbi->s_journal->j_task->io_context) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; } if ((ctx->spec & EXT4_SPEC_s_stripe) && ext4_is_stripe_incompatible(sb, ctx->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", ctx->s_stripe, sbi->s_cluster_ratio); ctx->s_stripe = 0; } /* * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause * two calls to ext4_should_dioread_nolock() to return inconsistent * values, triggering WARN_ON in ext4_add_complete_io(). we grab * here s_writepages_rwsem to avoid race between writepages ops and * remount. */ alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " "during remount not supported; ignoring"); sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); err = -EINVAL; goto restore_opts; } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dioread_nolock"); err = -EINVAL; goto restore_opts; } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); err = -EINVAL; goto restore_opts; } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); err = -EINVAL; goto restore_opts; } if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && !test_opt(sb, DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); es = sbi->s_es; if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); } /* Flush outstanding errors before changing fs state */ flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } if (fc->sb_flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition * readonly, and if so set the rdonly flag and then * mark the partition as valid again. */ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); if (sbi->s_journal) { /* * We let remount-ro finish even if marking fs * as clean failed... */ ext4_mark_recovery_complete(sb, es); } } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EFSBADCRC; goto restore_opts; } } /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. */ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " "umount/remount instead"); err = -EINVAL; goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) */ if (sbi->s_journal) { err = ext4_clear_journal_err(sb, es); if (err) goto restore_opts; } sbi->s_mount_state = (le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) goto restore_opts; sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA enable_quota = 1; #endif } } /* * Handle creation of system zone data early because it can fail. * Releasing of existing data is done when we are sure remount will * succeed. */ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { err = ext4_setup_system_zone(sb); if (err) goto restore_opts; } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; } } /* Release old quota file names */ for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); /* * Reinitialize lazy itable initialization thread based on * current settings */ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; first_not_zeroed = ext4_has_uninit_itable(sb); ext4_register_li_request(sb, first_not_zeroed); } if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); /* * Handle aborting the filesystem as the last thing during remount to * avoid obsure errors during remount when some option changes fail to * apply due to shutdown filesystem. */ if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); return 0; restore_opts: /* * If there was a failing r/w to ro transition, we may need to * re-enable quota */ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; ext4_writepages_up_write(sb, alloc_ctx); if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { to_free[i] = get_qf_name(sb, sbi, i); rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } synchronize_rcu(); for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return err; } static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) return ret; ret = __ext4_remount(fc, sb); if (ret < 0) return ret; ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", &sb->s_uuid, (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) { struct kqid qid; struct dquot *dquot; u64 limit; u64 curblock; qid = make_kqid_projid(projid); dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit) { uint64_t remaining = 0; curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; if (limit > curblock) remaining = limit - curblock; buf->f_blocks = min(buf->f_blocks, limit); buf->f_bfree = min(buf->f_bfree, remaining); buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); if (limit) { uint64_t remaining = 0; if (limit > dquot->dq_dqb.dqb_curinodes) remaining = limit - dquot->dq_dqb.dqb_curinodes; buf->f_files = min(buf->f_files, limit); buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } #endif static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - (ext4_r_blocks_count(es) + resv_blocks); if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); #endif return 0; } #ifdef CONFIG_QUOTA /* * Helper functions so that transaction is started before we acquire dqio_sem * to keep correct lock ordering of transaction > dqio_sem */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; struct inode *inode; inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to commit dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_acquire_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to acquire dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; bool freeze_protected = false; /* * Trying to sb_start_intwrite() in a running transaction * can result in a deadlock. Further, running transactions * are already protected from freezing. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(dquot->dq_sb); freeze_protected = true; } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); if (freeze_protected) sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to release dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; if (freeze_protected) sb_end_intwrite(dquot->dq_sb); return ret; } static int ext4_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; if (ext4_is_quota_journalled(sb)) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { return dquot_mark_dquot_dirty(dquot); } } static int ext4_write_info(struct super_block *sb, int type) { int ret, err; handle_t *handle; /* Data block + inode block */ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); /* The first argument of lockdep_set_subclass has to be * *exactly* the same as the argument to init_rwsem() --- in * this case, in init_once() --- or lockdep gets unhappy * because the name of the lock is set using the * stringification of the argument to init_rwsem(). */ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, subclass); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; if (!test_opt(sb, QUOTA)) return -EINVAL; /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) return -EXDEV; /* Quota already enabled for this file? */ if (IS_NOQUOTA(d_inode(path->dentry))) return -EBUSY; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; } else { /* * Clear the flag just in case mount options changed since * last time. */ sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; /* * Set inode flags to prevent userspace from messing with quota * files. If this fails, we return success anyway since quotas * are already enabled and this is not a hard failure. */ inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto unlock_inode; EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); if (err) dquot_quota_off(sb, type); } if (err) lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); return err; } static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) { switch (type) { case USRQUOTA: return qf_inum == EXT4_USR_QUOTA_INO; case GRPQUOTA: return qf_inum == EXT4_GRP_QUOTA_INO; case PRJQUOTA: return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; default: BUG(); } } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { int err; struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; if (!ext4_check_quota_inum(type, qf_inums[type])) { ext4_error(sb, "Bad quota inum: %lu, type: %d", qf_inums[type], type); return -EUCLEAN; } qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode: %lu, type: %d", qf_inums[type], type); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); return err; } /* Enable usage tracking for all quota types. */ int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; bool quota_mopt[EXT4_MAXQUOTAS] = { test_opt(sb, USRQUOTA), test_opt(sb, GRPQUOTA), test_opt(sb, PRJQUOTA), }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); ext4_quotas_off(sb, type); return err; } } } return 0; } static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; /* * When the filesystem was remounted read-only first, we cannot cleanup * inode flags here. Bad luck but people should be using QUOTA feature * these days anyway. */ if (sb_rdonly(sb)) goto out_put; inode_lock(inode); /* * Update modification times of quota files when userspace can * start looking at them. If we fail, we return success anyway since * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_unlock; } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); out_put: lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: return dquot_quota_off(sb, type); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) /* A hole? */ memset(data, 0, tocopy); else memcpy(data, bh->b_data+offset, tocopy); brelse(bh); offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile (we know the transaction is already started and has * enough credits) */ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } /* * Since we account only one data block in transaction credits, * then it is impossible to cross a block boundary. */ if (sb->s_blocksize - offset < len) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because not block aligned", (unsigned long long)off, (unsigned long long)len); return -EIO; } do { bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; } return err ? err : len; } #endif #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext2 (%d)\n", err); } static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext3 (%d)\n", err); } static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } static inline int ext3_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; if (!ext4_has_feature_journal(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); if (bdev_file) bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); static int __init ext4_init_fs(void) { int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; /* Build-time check for flags consistency */ ext4_check_flag_values(); err = ext4_init_es(); if (err) return err; err = ext4_init_pending(); if (err) goto out7; err = ext4_init_post_read_processing(); if (err) goto out6; err = ext4_init_pageio(); if (err) goto out5; err = ext4_init_system_zone(); if (err) goto out4; err = ext4_init_sysfs(); if (err) goto out3; err = ext4_init_mballoc(); if (err) goto out2; err = init_inodecache(); if (err) goto out1; err = ext4_fc_init_dentry_cache(); if (err) goto out05; register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: unregister_as_ext2(); unregister_as_ext3(); ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: ext4_exit_mballoc(); out2: ext4_exit_sysfs(); out3: ext4_exit_system_zone(); out4: ext4_exit_pageio(); out5: ext4_exit_post_read_processing(); out6: ext4_exit_pending(); out7: ext4_exit_es(); return err; } static void __exit ext4_exit_fs(void) { ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); module_init(ext4_init_fs) module_exit(ext4_exit_fs)
84 84 295 294 1 5 5 5 5 5 1 1 1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Christoph Hellwig. * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. */ #include "xfs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_da_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_da_btree.h" #include "xfs_attr.h" #include "xfs_acl.h" #include "xfs_log.h" #include "xfs_xattr.h" #include "xfs_quota.h" #include <linux/posix_acl_xattr.h> /* * Get permission to use log-assisted atomic exchange of file extents. * Callers must not be running any transactions or hold any ILOCKs. */ static inline int xfs_attr_grab_log_assist( struct xfs_mount *mp) { int error = 0; /* xattr update log intent items are already enabled */ if (xfs_is_using_logged_xattrs(mp)) return 0; /* * Check if the filesystem featureset is new enough to set this log * incompat feature bit. Strictly speaking, the minimum requirement is * a V5 filesystem for the superblock field, but we'll require rmap * or reflink to avoid having to deal with really old kernels. */ if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) return -EOPNOTSUPP; /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); if (error) return error; xfs_set_using_logged_xattrs(mp); xfs_warn_experimental(mp, XFS_EXPERIMENTAL_LARP); return 0; } static inline bool xfs_attr_want_log_assist( struct xfs_mount *mp) { #ifdef DEBUG /* Logged xattrs require a V5 super for log_incompat */ return xfs_has_crc(mp) && xfs_globals.larp; #else return false; #endif } /* * Set or remove an xattr, having grabbed the appropriate logging resources * prior to calling libxfs. Callers of this function are only required to * initialize the inode, attr_filter, name, namelen, value, and valuelen fields * of @args. */ int xfs_attr_change( struct xfs_da_args *args, enum xfs_attr_update op) { struct xfs_mount *mp = args->dp->i_mount; int error; if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(args->dp); if (error) return error; /* * We have no control over the attribute names that userspace passes us * to remove, so we have to allow the name lookup prior to attribute * removal to fail as well. */ args->op_flags = XFS_DA_OP_OKNOENT; if (xfs_attr_want_log_assist(mp)) { error = xfs_attr_grab_log_assist(mp); if (error) return error; args->op_flags |= XFS_DA_OP_LOGGED; } args->owner = args->dp->i_ino; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; xfs_attr_sethash(args); /* * Some xattrs must be resistant to allocation failure at ENOSPC, e.g. * creating an inode with ACLs or security attributes requires the * allocation of the xattr holding that information to succeed. Hence * we allow xattrs in the VFS TRUSTED, SYSTEM, POSIX_ACL and SECURITY * (LSM xattr) namespaces to dip into the reserve block pool to allow * manipulation of these xattrs when at ENOSPC. These VFS xattr * namespaces translate to the XFS_ATTR_ROOT and XFS_ATTR_SECURE on-disk * namespaces. * * For most of these cases, these special xattrs will fit in the inode * itself and so consume no extra space or only require temporary extra * space while an overwrite is being made. Hence the use of the reserved * pool is largely to avoid the worst case reservation from preventing * the xattr from being created at ENOSPC. */ return xfs_attr_set(args, op, args->attr_filter & (XFS_ATTR_ROOT | XFS_ATTR_SECURE)); } static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *value, size_t size) { struct xfs_da_args args = { .dp = XFS_I(inode), .attr_filter = handler->flags, .name = name, .namelen = strlen(name), .value = value, .valuelen = size, }; int error; if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) return -EIO; error = xfs_attr_get(&args); if (error) return error; return args.valuelen; } static inline enum xfs_attr_update xfs_xattr_flags_to_op( int flags, const void *value) { if (!value) return XFS_ATTRUPDATE_REMOVE; if (flags & XATTR_CREATE) return XFS_ATTRUPDATE_CREATE; if (flags & XATTR_REPLACE) return XFS_ATTRUPDATE_REPLACE; return XFS_ATTRUPDATE_UPSERT; } static int xfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct xfs_da_args args = { .dp = XFS_I(inode), .attr_filter = handler->flags, .name = name, .namelen = strlen(name), .value = (void *)value, .valuelen = size, }; int error; error = xfs_attr_change(&args, xfs_xattr_flags_to_op(flags, value)); if (!error && (handler->flags & XFS_ATTR_ROOT)) xfs_forget_acl(inode, name); return error; } static const struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = 0, /* no flags implies user namespace */ .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = XFS_ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = XFS_ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; const struct xattr_handler * const xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, NULL }; static void __xfs_xattr_put_listent( struct xfs_attr_list_context *context, char *prefix, int prefix_len, unsigned char *name, int namelen) { char *offset; int arraytop; if (context->count < 0 || context->seen_enough) return; if (!context->buffer) goto compute_size; arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ context->seen_enough = 1; return; } offset = context->buffer + context->count; memcpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; compute_size: context->count += prefix_len + namelen + 1; return; } static void xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, unsigned char *name, int namelen, void *value, int valuelen) { char *prefix; int prefix_len; ASSERT(context->count >= 0); /* Don't expose private xattr namespaces. */ if (flags & XFS_ATTR_PRIVATE_NSP_MASK) return; if (flags & XFS_ATTR_ROOT) { #ifdef CONFIG_XFS_POSIX_ACL if (namelen == SGI_ACL_FILE_SIZE && strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) { __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_ACCESS, strlen(XATTR_POSIX_ACL_ACCESS)); } else if (namelen == SGI_ACL_DEFAULT_SIZE && strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) { __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_DEFAULT, strlen(XATTR_POSIX_ACL_DEFAULT)); } #endif /* * Only show root namespace entries if we are actually allowed to * see them. */ if (!capable(CAP_SYS_ADMIN)) return; prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; } else if (flags & XFS_ATTR_SECURE) { prefix = XATTR_SECURITY_PREFIX; prefix_len = XATTR_SECURITY_PREFIX_LEN; } else { prefix = XATTR_USER_PREFIX; prefix_len = XATTR_USER_PREFIX_LEN; } __xfs_xattr_put_listent(context, prefix, prefix_len, name, namelen); return; } ssize_t xfs_vn_listxattr( struct dentry *dentry, char *data, size_t size) { struct xfs_attr_list_context context; struct inode *inode = d_inode(dentry); int error; if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) return -EIO; /* * First read the regular on-disk attributes. */ memset(&context, 0, sizeof(context)); context.dp = XFS_I(inode); context.resynch = 1; context.buffer = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; error = xfs_attr_list(&context); if (error) return error; if (context.count < 0) return -ERANGE; return context.count; }
6 6 203 203 34 34 3 33 4 33 2 33 2 32 34 34 3 33 3 5 38 38 1 2 38 38 38 38 38 32 11 32 1 32 32 10 33 33 33 1 32 32 32 223 223 222 2 221 4 4 4 222 162 137 114 222 34 34 248 222 223 203 202 34 202 3 3 2 1 200 40 40 190 190 34 198 71 45 4 31 7 34 34 198 198 124 103 93 169 151 169 167 12 3 4 9 1 122 124 96 55 46 14 159 138 111 195 195 195 195 195 195 195 195 195 1 135 248 210 136 195 195 195 195 195 74 195 195 27 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_quota.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_error.h" #include "xfs_health.h" /* * Lock order: * * ip->i_lock * qi->qi_tree_lock * dquot->q_qlock (xfs_dqlock() and friends) * dquot->q_flush (xfs_dqflock() and friends) * qi->qi_lru_lock * * If two dquots need to be locked the order is user before group/project, * otherwise by the lowest id first, see xfs_dqlock2. */ struct kmem_cache *xfs_dqtrx_cache; static struct kmem_cache *xfs_dquot_cache; static struct lock_class_key xfs_dquot_group_class; static struct lock_class_key xfs_dquot_project_class; /* Record observations of quota corruption with the health tracking system. */ static void xfs_dquot_mark_sick( struct xfs_dquot *dqp) { struct xfs_mount *mp = dqp->q_mount; switch (dqp->q_type) { case XFS_DQTYPE_USER: xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA); break; case XFS_DQTYPE_GROUP: xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA); break; case XFS_DQTYPE_PROJ: xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA); break; default: ASSERT(0); break; } } /* * Detach the dquot buffer if it's still attached, because we can get called * through dqpurge after a log shutdown. Caller must hold the dqflock or have * otherwise isolated the dquot. */ void xfs_dquot_detach_buf( struct xfs_dquot *dqp) { struct xfs_dq_logitem *qlip = &dqp->q_logitem; struct xfs_buf *bp = NULL; spin_lock(&qlip->qli_lock); if (qlip->qli_item.li_buf) { bp = qlip->qli_item.li_buf; qlip->qli_item.li_buf = NULL; } spin_unlock(&qlip->qli_lock); if (bp) { xfs_buf_lock(bp); list_del_init(&qlip->qli_item.li_bio_list); xfs_buf_relse(bp); } } /* * This is called to free all the memory associated with a dquot */ void xfs_qm_dqdestroy( struct xfs_dquot *dqp) { ASSERT(list_empty(&dqp->q_lru)); ASSERT(dqp->q_logitem.qli_item.li_buf == NULL); kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); kmem_cache_free(xfs_dquot_cache, dqp); } /* * If default limits are in force, push them into the dquot now. * We overwrite the dquot limits only if they are zero and this * is not the root dquot. */ void xfs_qm_adjust_dqlimits( struct xfs_dquot *dq) { struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_def_quota *defq; int prealloc = 0; ASSERT(dq->q_id); defq = xfs_get_defquota(q, xfs_dquot_type(dq)); if (!dq->q_blk.softlimit) { dq->q_blk.softlimit = defq->blk.soft; prealloc = 1; } if (!dq->q_blk.hardlimit) { dq->q_blk.hardlimit = defq->blk.hard; prealloc = 1; } if (!dq->q_ino.softlimit) dq->q_ino.softlimit = defq->ino.soft; if (!dq->q_ino.hardlimit) dq->q_ino.hardlimit = defq->ino.hard; if (!dq->q_rtb.softlimit) dq->q_rtb.softlimit = defq->rtb.soft; if (!dq->q_rtb.hardlimit) dq->q_rtb.hardlimit = defq->rtb.hard; if (prealloc) xfs_dquot_set_prealloc_limits(dq); } /* Set the expiration time of a quota's grace period. */ time64_t xfs_dquot_set_timeout( struct xfs_mount *mp, time64_t timeout) { struct xfs_quotainfo *qi = mp->m_quotainfo; return clamp_t(time64_t, timeout, qi->qi_expiry_min, qi->qi_expiry_max); } /* Set the length of the default grace period. */ time64_t xfs_dquot_set_grace_period( time64_t grace) { return clamp_t(time64_t, grace, XFS_DQ_GRACE_MIN, XFS_DQ_GRACE_MAX); } /* * Determine if this quota counter is over either limit and set the quota * timers as appropriate. */ static inline void xfs_qm_adjust_res_timer( struct xfs_mount *mp, struct xfs_dquot_res *res, struct xfs_quota_limits *qlim) { ASSERT(res->hardlimit == 0 || res->softlimit <= res->hardlimit); if ((res->softlimit && res->count > res->softlimit) || (res->hardlimit && res->count > res->hardlimit)) { if (res->timer == 0) res->timer = xfs_dquot_set_timeout(mp, ktime_get_real_seconds() + qlim->time); } else { res->timer = 0; } } /* * Check the limits and timers of a dquot and start or reset timers * if necessary. * This gets called even when quota enforcement is OFF, which makes our * life a little less complicated. (We just don't reject any quota * reservations in that case, when enforcement is off). * We also return 0 as the values of the timers in Q_GETQUOTA calls, when * enforcement's off. * In contrast, warnings are a little different in that they don't * 'automatically' get started when limits get exceeded. They do * get reset to zero, however, when we find the count to be under * the soft limit (they are only ever set non-zero via userspace). */ void xfs_qm_adjust_dqtimers( struct xfs_dquot *dq) { struct xfs_mount *mp = dq->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; struct xfs_def_quota *defq; ASSERT(dq->q_id); defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_blk, &defq->blk); xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_ino, &defq->ino); xfs_qm_adjust_res_timer(dq->q_mount, &dq->q_rtb, &defq->rtb); } /* * initialize a buffer full of dquots and log the whole thing */ void xfs_qm_init_dquot_blk( struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; unsigned int qflag; unsigned int blftype; int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); switch (type) { case XFS_DQTYPE_USER: qflag = XFS_UQUOTA_CHKD; blftype = XFS_BLF_UDQUOT_BUF; break; case XFS_DQTYPE_PROJ: qflag = XFS_PQUOTA_CHKD; blftype = XFS_BLF_PDQUOT_BUF; break; case XFS_DQTYPE_GROUP: qflag = XFS_GQUOTA_CHKD; blftype = XFS_BLF_GDQUOT_BUF; break; default: ASSERT(0); return; } d = bp->b_addr; /* * ID of the first dquot in the block - id's are zero based. */ curid = id - (id % q->qi_dqperchunk); memset(d, 0, BBTOB(q->qi_dqchunklen)); for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; if (curid > 0 && xfs_has_bigtime(mp)) d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; if (xfs_has_crc(mp)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } } xfs_trans_dquot_buf(tp, bp, blftype); /* * quotacheck uses delayed writes to update all the dquots on disk in an * efficient manner instead of logging the individual dquot changes as * they are made. However if we log the buffer allocated here and crash * after quotacheck while the logged initialisation is still in the * active region of the log, log recovery can replay the dquot buffer * initialisation over the top of the checked dquots and corrupt quota * accounting. * * To avoid this problem, quotacheck cannot log the initialised buffer. * We must still dirty the buffer and write it back before the * allocation transaction clears the log. Therefore, mark the buffer as * ordered instead of logging it directly. This is safe for quotacheck * because it detects and repairs allocated but initialized dquot blocks * in the quota inodes. */ if (!(mp->m_qflags & qflag)) xfs_trans_ordered_buf(tp, bp); else xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } static void xfs_dquot_set_prealloc( struct xfs_dquot_pre *pre, const struct xfs_dquot_res *res) { xfs_qcnt_t space; pre->q_prealloc_hi_wmark = res->hardlimit; pre->q_prealloc_lo_wmark = res->softlimit; space = div_u64(pre->q_prealloc_hi_wmark, 100); if (!pre->q_prealloc_lo_wmark) pre->q_prealloc_lo_wmark = space * 95; pre->q_low_space[XFS_QLOWSP_1_PCNT] = space; pre->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; pre->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } /* * Initialize the dynamic speculative preallocation thresholds. The lo/hi * watermarks correspond to the soft and hard limits by default. If a soft limit * is not specified, we use 95% of the hard limit. */ void xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) { xfs_dquot_set_prealloc(&dqp->q_blk_prealloc, &dqp->q_blk); xfs_dquot_set_prealloc(&dqp->q_rtb_prealloc, &dqp->q_rtb); } /* * Ensure that the given in-core dquot has a buffer on disk backing it, and * return the buffer locked and held. This is called when the bmapi finds a * hole. */ STATIC int xfs_dquot_disk_alloc( struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_bmbt_irec map; struct xfs_trans *tp; struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; xfs_dqtype_t qtype = xfs_dquot_type(dqp); struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); int nmaps = 1; int error; trace_xfs_dqalloc(dqp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); if (error) return error; xfs_ilock(quotip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, quotip, 0); if (!xfs_this_quota_on(dqp->q_mount, qtype)) { /* * Return if this type of quotas is turned off while we didn't * have an inode lock */ error = -ESRCH; goto err_cancel; } error = xfs_iext_count_extend(tp, quotip, XFS_DATA_FORK, XFS_IEXT_ADD_NOSPLIT_CNT); if (error) goto err_cancel; /* Create the block mapping. */ error = xfs_bmapi_write(tp, quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, &map, &nmaps); if (error) goto err_cancel; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); /* * Keep track of the blkno to save a lookup later */ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp); if (error) goto err_cancel; bp->b_ops = &xfs_dquot_buf_ops; /* * Make a chunk of dquots out of this buffer and log * the entire thing. */ xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* * Hold the buffer and join it to the dfops so that we'll still own * the buffer when we return to the caller. The buffer disposal on * error must be paid attention to very carefully, as it has been * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota * code when allocating a new dquot record" in 2005, and the later * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep * the buffer locked across the _defer_finish call. We can now do * this correctly with xfs_defer_bjoin. * * Above, we allocated a disk block for the dquot information and used * get_buf to initialize the dquot. If the _defer_finish fails, the old * transaction is gone but the new buffer is not joined or held to any * transaction, so we must _buf_relse it. * * If everything succeeds, the caller of this function is returned a * buffer that is locked and held to the transaction. The caller * is responsible for unlocking any buffer passed back, either * manually or by committing the transaction. On error, the buffer is * released and not passed back. * * Keep the quota inode ILOCKed until after the transaction commit to * maintain the atomicity of bmap/rmap updates. */ xfs_trans_bhold(tp, bp); error = xfs_trans_commit(tp); xfs_iunlock(quotip, XFS_ILOCK_EXCL); if (error) { xfs_buf_relse(bp); return error; } *bpp = bp; return 0; err_cancel: xfs_trans_cancel(tp); xfs_iunlock(quotip, XFS_ILOCK_EXCL); return error; } /* * Read in the in-core dquot's on-disk metadata and return the buffer. * Returns ENOENT to signal a hole. */ STATIC int xfs_dquot_disk_read( struct xfs_mount *mp, struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_bmbt_irec map; struct xfs_buf *bp; xfs_dqtype_t qtype = xfs_dquot_type(dqp); struct xfs_inode *quotip = xfs_quota_inode(mp, qtype); uint lock_mode; int nmaps = 1; int error; lock_mode = xfs_ilock_data_map_shared(quotip); if (!xfs_this_quota_on(mp, qtype)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. */ xfs_iunlock(quotip, lock_mode); return -ESRCH; } /* * Find the block map; no allocations yet */ error = xfs_bmapi_read(quotip, dqp->q_fileoffset, XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, lock_mode); if (error) return error; ASSERT(nmaps == 1); ASSERT(map.br_blockcount >= 1); ASSERT(map.br_startblock != DELAYSTARTBLOCK); if (map.br_startblock == HOLESTARTBLOCK) return -ENOENT; trace_xfs_dqtobp_read(dqp); /* * store the blkno etc so that we don't have to do the * mapping all the time */ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); if (xfs_metadata_is_sick(error)) xfs_dquot_mark_sick(dqp); if (error) { ASSERT(bp == NULL); return error; } ASSERT(xfs_buf_islocked(bp)); xfs_buf_set_ref(bp, XFS_DQUOT_REF); *bpp = bp; return 0; } /* Allocate and initialize everything we need for an incore dquot. */ STATIC struct xfs_dquot * xfs_dquot_alloc( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type) { struct xfs_dquot *dqp; dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL); dqp->q_type = type; dqp->q_id = id; dqp->q_mount = mp; INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; /* * Offset of dquot in the (fixed sized) dquot chunk. */ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * sizeof(struct xfs_dqblk); /* * Because we want to use a counting completion, complete * the flush completion once to allow a single access to * the flush completion without blocking. */ init_completion(&dqp->q_flush); complete(&dqp->q_flush); /* * Make sure group quotas have a different lock class than user * quotas. */ switch (type) { case XFS_DQTYPE_USER: /* uses the default lock class */ break; case XFS_DQTYPE_GROUP: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); break; case XFS_DQTYPE_PROJ: lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); break; default: ASSERT(0); break; } xfs_qm_dquot_logitem_init(dqp); XFS_STATS_INC(mp, xs_qm_dquot); return dqp; } /* Check the ondisk dquot's id and type match what the incore dquot expects. */ static bool xfs_dquot_check_type( struct xfs_dquot *dqp, struct xfs_disk_dquot *ddqp) { uint8_t ddqp_type; uint8_t dqp_type; ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK; dqp_type = xfs_dquot_type(dqp); if (be32_to_cpu(ddqp->d_id) != dqp->q_id) return false; /* * V5 filesystems always expect an exact type match. V4 filesystems * expect an exact match for user dquots and for non-root group and * project dquots. */ if (xfs_has_crc(dqp->q_mount) || dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0) return ddqp_type == dqp_type; /* * V4 filesystems support either group or project quotas, but not both * at the same time. The non-user quota file can be switched between * group and project quota uses depending on the mount options, which * means that we can encounter the other type when we try to load quota * defaults. Quotacheck will soon reset the entire quota file * (including the root dquot) anyway, but don't log scary corruption * reports to dmesg. */ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ; } /* Copy the in-core quota fields in from the on-disk buffer. */ STATIC int xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset); struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq; /* * Ensure that we got the type and ID we were looking for. * Everything else was checked by the dquot buffer verifier. */ if (!xfs_dquot_check_type(dqp, ddqp)) { xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, quota %u", __this_address, dqp->q_id); xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); xfs_dquot_mark_sick(dqp); return -EFSCORRUPTED; } /* copy everything from disk dquot to the incore dquot */ dqp->q_type = ddqp->d_type; dqp->q_blk.hardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); dqp->q_blk.softlimit = be64_to_cpu(ddqp->d_blk_softlimit); dqp->q_ino.hardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); dqp->q_ino.softlimit = be64_to_cpu(ddqp->d_ino_softlimit); dqp->q_rtb.hardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); dqp->q_rtb.softlimit = be64_to_cpu(ddqp->d_rtb_softlimit); dqp->q_blk.count = be64_to_cpu(ddqp->d_bcount); dqp->q_ino.count = be64_to_cpu(ddqp->d_icount); dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount); dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer); dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer); dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer); /* * Reservation counters are defined as reservation plus current usage * to avoid having to add every time. */ dqp->q_blk.reserved = dqp->q_blk.count; dqp->q_ino.reserved = dqp->q_ino.count; dqp->q_rtb.reserved = dqp->q_rtb.count; /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); return 0; } /* Copy the in-core quota fields into the on-disk buffer. */ void xfs_dquot_to_disk( struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp) { ddqp->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); ddqp->d_version = XFS_DQUOT_VERSION; ddqp->d_type = dqp->q_type; ddqp->d_id = cpu_to_be32(dqp->q_id); ddqp->d_pad0 = 0; ddqp->d_pad = 0; ddqp->d_blk_hardlimit = cpu_to_be64(dqp->q_blk.hardlimit); ddqp->d_blk_softlimit = cpu_to_be64(dqp->q_blk.softlimit); ddqp->d_ino_hardlimit = cpu_to_be64(dqp->q_ino.hardlimit); ddqp->d_ino_softlimit = cpu_to_be64(dqp->q_ino.softlimit); ddqp->d_rtb_hardlimit = cpu_to_be64(dqp->q_rtb.hardlimit); ddqp->d_rtb_softlimit = cpu_to_be64(dqp->q_rtb.softlimit); ddqp->d_bcount = cpu_to_be64(dqp->q_blk.count); ddqp->d_icount = cpu_to_be64(dqp->q_ino.count); ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count); ddqp->d_bwarns = 0; ddqp->d_iwarns = 0; ddqp->d_rtbwarns = 0; ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer); ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer); ddqp->d_rtbtimer = xfs_dquot_to_disk_ts(dqp, dqp->q_rtb.timer); } /* * Read in the ondisk dquot using dqtobp() then copy it to an incore version, * and release the buffer immediately. If @can_alloc is true, fill any * holes in the on-disk metadata. */ static int xfs_qm_dqread( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **dqpp) { struct xfs_dquot *dqp; struct xfs_buf *bp; int error; dqp = xfs_dquot_alloc(mp, id, type); trace_xfs_dqread(dqp); /* Try to read the buffer, allocating if necessary. */ error = xfs_dquot_disk_read(mp, dqp, &bp); if (error == -ENOENT && can_alloc) error = xfs_dquot_disk_alloc(dqp, &bp); if (error) goto err; /* * At this point we should have a clean locked buffer. Copy the data * to the incore dquot and release the buffer since the incore dquot * has its own locking protocol so we needn't tie up the buffer any * further. */ ASSERT(xfs_buf_islocked(bp)); error = xfs_dquot_from_disk(dqp, bp); xfs_buf_relse(bp); if (error) goto err; *dqpp = dqp; return error; err: trace_xfs_dqread_fail(dqp); xfs_qm_dqdestroy(dqp); *dqpp = NULL; return error; } /* * Advance to the next id in the current chunk, or if at the * end of the chunk, skip ahead to first id in next allocated chunk * using the SEEK_DATA interface. */ static int xfs_dq_get_next_id( struct xfs_mount *mp, xfs_dqtype_t type, xfs_dqid_t *id) { struct xfs_inode *quotip = xfs_quota_inode(mp, type); xfs_dqid_t next_id = *id + 1; /* simple advance */ uint lock_flags; struct xfs_bmbt_irec got; struct xfs_iext_cursor cur; xfs_fsblock_t start; int error = 0; /* If we'd wrap past the max ID, stop */ if (next_id < *id) return -ENOENT; /* If new ID is within the current chunk, advancing it sufficed */ if (next_id % mp->m_quotainfo->qi_dqperchunk) { *id = next_id; return 0; } /* Nope, next_id is now past the current chunk, so find the next one */ start = (xfs_fsblock_t)next_id / mp->m_quotainfo->qi_dqperchunk; lock_flags = xfs_ilock_data_map_shared(quotip); error = xfs_iread_extents(NULL, quotip, XFS_DATA_FORK); if (error) return error; if (xfs_iext_lookup_extent(quotip, &quotip->i_df, start, &cur, &got)) { /* contiguous chunk, bump startoff for the id calculation */ if (got.br_startoff < start) got.br_startoff = start; *id = got.br_startoff * mp->m_quotainfo->qi_dqperchunk; } else { error = -ENOENT; } xfs_iunlock(quotip, lock_flags); return error; } /* * Look up the dquot in the in-core cache. If found, the dquot is returned * locked and ready to go. */ static struct xfs_dquot * xfs_qm_dqget_cache_lookup( struct xfs_mount *mp, struct xfs_quotainfo *qi, struct radix_tree_root *tree, xfs_dqid_t id) { struct xfs_dquot *dqp; restart: mutex_lock(&qi->qi_tree_lock); dqp = radix_tree_lookup(tree, id); if (!dqp) { mutex_unlock(&qi->qi_tree_lock); XFS_STATS_INC(mp, xs_qm_dqcachemisses); return NULL; } xfs_dqlock(dqp); if (dqp->q_flags & XFS_DQFLAG_FREEING) { xfs_dqunlock(dqp); mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); delay(1); goto restart; } dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); XFS_STATS_INC(mp, xs_qm_dqcachehits); return dqp; } /* * Try to insert a new dquot into the in-core cache. If an error occurs the * caller should throw away the dquot and start over. Otherwise, the dquot * is returned locked (and held by the cache) as if there had been a cache * hit. * * The insert needs to be done under memalloc_nofs context because the radix * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in * memory reclaim when freeing unused dquots, so we cannot have the radix tree * node allocation recursing into filesystem reclaim whilst we hold the * qi_tree_lock. */ static int xfs_qm_dqget_cache_insert( struct xfs_mount *mp, struct xfs_quotainfo *qi, struct radix_tree_root *tree, xfs_dqid_t id, struct xfs_dquot *dqp) { unsigned int nofs_flags; int error; nofs_flags = memalloc_nofs_save(); mutex_lock(&qi->qi_tree_lock); error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { /* Duplicate found! Caller must try again. */ trace_xfs_dqget_dup(dqp); goto out_unlock; } /* Return a locked dquot to the caller, with a reference taken. */ xfs_dqlock(dqp); dqp->q_nrefs = 1; qi->qi_dquots++; out_unlock: mutex_unlock(&qi->qi_tree_lock); memalloc_nofs_restore(nofs_flags); return error; } /* Check our input parameters. */ static int xfs_qm_dqget_checks( struct xfs_mount *mp, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) return -ESRCH; return 0; case XFS_DQTYPE_GROUP: if (!XFS_IS_GQUOTA_ON(mp)) return -ESRCH; return 0; case XFS_DQTYPE_PROJ: if (!XFS_IS_PQUOTA_ON(mp)) return -ESRCH; return 0; default: WARN_ON_ONCE(0); return -EINVAL; } } /* * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a * locked dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; error = xfs_qm_dqget_checks(mp, type); if (error) return error; restart: dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); if (dqp) { *O_dqpp = dqp; return 0; } error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); if (error) return error; error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); if (error) { /* * Duplicate found. Just throw away the new dquot and start * over. */ xfs_qm_dqdestroy(dqp); XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return 0; } /* * Given a dquot id and type, read and initialize a dquot from the on-disk * metadata. This function is only for use during quota initialization so * it ignores the dquot cache assuming that the dquot shrinker isn't set up. * The caller is responsible for _qm_dqdestroy'ing the returned dquot. */ int xfs_qm_dqget_uncached( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp) { int error; error = xfs_qm_dqget_checks(mp, type); if (error) return error; return xfs_qm_dqread(mp, id, type, 0, dqpp); } /* Return the quota id for a given inode and type. */ xfs_dqid_t xfs_qm_id_for_quotatype( struct xfs_inode *ip, xfs_dqtype_t type) { switch (type) { case XFS_DQTYPE_USER: return i_uid_read(VFS_I(ip)); case XFS_DQTYPE_GROUP: return i_gid_read(VFS_I(ip)); case XFS_DQTYPE_PROJ: return ip->i_projid; } ASSERT(0); return 0; } /* * Return the dquot for a given inode and type. If @can_alloc is true, then * allocate blocks if needed. The inode's ILOCK must be held and it must not * have already had an inode attached. */ int xfs_qm_dqget_inode( struct xfs_inode *ip, xfs_dqtype_t type, bool can_alloc, struct xfs_dquot **O_dqpp) { struct xfs_mount *mp = ip->i_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; xfs_dqid_t id; int error; error = xfs_qm_dqget_checks(mp, type); if (error) return error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_dquot(ip, type) == NULL); ASSERT(!xfs_is_metadir_inode(ip)); id = xfs_qm_id_for_quotatype(ip, type); restart: dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); if (dqp) { *O_dqpp = dqp; return 0; } /* * Dquot cache miss. We don't want to keep the inode lock across * a (potential) disk read. Also we don't want to deal with the lock * ordering between quotainode and this inode. OTOH, dropping the inode * lock here means dealing with a chown that can happen before * we re-acquire the lock. */ xfs_iunlock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) return error; /* * A dquot could be attached to this inode by now, since we had * dropped the ilock. */ if (xfs_this_quota_on(mp, type)) { struct xfs_dquot *dqp1; dqp1 = xfs_inode_dquot(ip, type); if (dqp1) { xfs_qm_dqdestroy(dqp); dqp = dqp1; xfs_dqlock(dqp); goto dqret; } } else { /* inode stays locked on return */ xfs_qm_dqdestroy(dqp); return -ESRCH; } error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); if (error) { /* * Duplicate found. Just throw away the new dquot and start * over. */ xfs_qm_dqdestroy(dqp); XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } dqret: xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return 0; } /* * Starting at @id and progressing upwards, look for an initialized incore * dquot, lock it, and return it. */ int xfs_qm_dqget_next( struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp) { struct xfs_dquot *dqp; int error = 0; *dqpp = NULL; for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) { error = xfs_qm_dqget(mp, id, type, false, &dqp); if (error == -ENOENT) continue; else if (error != 0) break; if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { *dqpp = dqp; return 0; } xfs_qm_dqput(dqp); } return error; } /* * Release a reference to the dquot (decrement ref-count) and unlock it. * * If there is a group quota attached to this dquot, carefully release that * too without tripping over deadlocks'n'stuff. */ void xfs_qm_dqput( struct xfs_dquot *dqp) { ASSERT(dqp->q_nrefs > 0); ASSERT(XFS_DQ_IS_LOCKED(dqp)); trace_xfs_dqput(dqp); if (--dqp->q_nrefs == 0) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; trace_xfs_dqput_free(dqp); if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); } /* * Release a dquot. Flush it if dirty, then dqput() it. * dquot must not be locked. */ void xfs_qm_dqrele( struct xfs_dquot *dqp) { if (!dqp) return; trace_xfs_dqrele(dqp); xfs_dqlock(dqp); /* * We don't care to flush it if the dquot is dirty here. * That will create stutters that we want to avoid. * Instead we do a delayed write when we try to reclaim * a dirty dquot. Also xfs_sync will take part of the burden... */ xfs_qm_dqput(dqp); } /* * This is the dquot flushing I/O completion routine. It is called * from interrupt level when the buffer containing the dquot is * flushed to disk. It is responsible for removing the dquot logitem * from the AIL if it has not been re-logged, and unlocking the dquot's * flush lock. This behavior is very similar to that of inodes.. */ static void xfs_qm_dqflush_done( struct xfs_log_item *lip) { struct xfs_dq_logitem *qlip = container_of(lip, struct xfs_dq_logitem, qli_item); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; struct xfs_buf *bp = NULL; xfs_lsn_t tail_lsn; /* * We only want to pull the item from the AIL if its * location in the log has not changed since we started the flush. * Thus, we only bother if the dquot's lsn has * not changed. First we check the lsn outside the lock * since it's cheaper, and then we recheck while * holding the lock before removing the dquot from the AIL. */ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { spin_lock(&ailp->ail_lock); xfs_clear_li_failed(lip); if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } else { spin_unlock(&ailp->ail_lock); } } /* * If this dquot hasn't been dirtied since initiating the last dqflush, * release the buffer reference. We already unlinked this dquot item * from the buffer. */ spin_lock(&qlip->qli_lock); if (!qlip->qli_dirty) { bp = lip->li_buf; lip->li_buf = NULL; } spin_unlock(&qlip->qli_lock); if (bp) xfs_buf_rele(bp); /* * Release the dq's flush lock since we're done with it. */ xfs_dqfunlock(dqp); } void xfs_buf_dquot_iodone( struct xfs_buf *bp) { struct xfs_log_item *lip, *n; list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { list_del_init(&lip->li_bio_list); xfs_qm_dqflush_done(lip); } } /* Check incore dquot for errors before we flush. */ static xfs_failaddr_t xfs_qm_dqflush_check( struct xfs_dquot *dqp) { xfs_dqtype_t type = xfs_dquot_type(dqp); if (type != XFS_DQTYPE_USER && type != XFS_DQTYPE_GROUP && type != XFS_DQTYPE_PROJ) return __this_address; if (dqp->q_id == 0) return NULL; if (dqp->q_blk.softlimit && dqp->q_blk.count > dqp->q_blk.softlimit && !dqp->q_blk.timer) return __this_address; if (dqp->q_ino.softlimit && dqp->q_ino.count > dqp->q_ino.softlimit && !dqp->q_ino.timer) return __this_address; if (dqp->q_rtb.softlimit && dqp->q_rtb.count > dqp->q_rtb.softlimit && !dqp->q_rtb.timer) return __this_address; /* bigtime flag should never be set on root dquots */ if (dqp->q_type & XFS_DQTYPE_BIGTIME) { if (!xfs_has_bigtime(dqp->q_mount)) return __this_address; if (dqp->q_id == 0) return __this_address; } return NULL; } /* * Get the buffer containing the on-disk dquot. * * Requires dquot flush lock, will clear the dirty flag, delete the quota log * item from the AIL, and shut down the system if something goes wrong. */ static int xfs_dquot_read_buf( struct xfs_trans *tp, struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp = NULL; int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); if (xfs_metadata_is_sick(error)) xfs_dquot_mark_sick(dqp); if (error) goto out_abort; *bpp = bp; return 0; out_abort: dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(&dqp->q_logitem.qli_item, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } /* * Attach a dquot buffer to this dquot to avoid allocating a buffer during a * dqflush, since dqflush can be called from reclaim context. Caller must hold * the dqlock. */ int xfs_dquot_attach_buf( struct xfs_trans *tp, struct xfs_dquot *dqp) { struct xfs_dq_logitem *qlip = &dqp->q_logitem; struct xfs_log_item *lip = &qlip->qli_item; int error; spin_lock(&qlip->qli_lock); if (!lip->li_buf) { struct xfs_buf *bp = NULL; spin_unlock(&qlip->qli_lock); error = xfs_dquot_read_buf(tp, dqp, &bp); if (error) return error; /* * Hold the dquot buffer so that we retain our ref to it after * detaching it from the transaction, then give that ref to the * dquot log item so that the AIL does not have to read the * dquot buffer to push this item. */ xfs_buf_hold(bp); xfs_trans_brelse(tp, bp); spin_lock(&qlip->qli_lock); lip->li_buf = bp; } qlip->qli_dirty = true; spin_unlock(&qlip->qli_lock); return 0; } /* * Get a new reference the dquot buffer attached to this dquot for a dqflush * operation. * * Returns 0 and a NULL bp if none was attached to the dquot; 0 and a locked * bp; or -EAGAIN if the buffer could not be locked. */ int xfs_dquot_use_attached_buf( struct xfs_dquot *dqp, struct xfs_buf **bpp) { struct xfs_buf *bp = dqp->q_logitem.qli_item.li_buf; /* * A NULL buffer can happen if the dquot dirty flag was set but the * filesystem shut down before transaction commit happened. In that * case we're not going to flush anyway. */ if (!bp) { ASSERT(xfs_is_shutdown(dqp->q_mount)); *bpp = NULL; return 0; } if (!xfs_buf_trylock(bp)) return -EAGAIN; xfs_buf_hold(bp); *bpp = bp; return 0; } /* * Write a modified dquot to disk. * The dquot must be locked and the flush lock too taken by caller. * The flush lock will not be unlocked until the dquot reaches the disk, * but the dquot is free to be unlocked and modified by the caller * in the interim. Dquot is still locked on return. This behavior is * identical to that of inodes. */ int xfs_qm_dqflush( struct xfs_dquot *dqp, struct xfs_buf *bp) { struct xfs_mount *mp = dqp->q_mount; struct xfs_dq_logitem *qlip = &dqp->q_logitem; struct xfs_log_item *lip = &qlip->qli_item; struct xfs_dqblk *dqblk; xfs_failaddr_t fa; int error; ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); trace_xfs_dqflush(dqp); xfs_qm_dqunpin_wait(dqp); fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; } /* Flush the incore dquot to the ondisk buffer. */ dqblk = xfs_buf_offset(bp, dqp->q_bufoffset); xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* * Clear the dirty field and remember the flush lsn for later use. */ dqp->q_flags &= ~XFS_DQFLAG_DIRTY; /* * We hold the dquot lock, so nobody can dirty it while we're * scheduling the write out. Clear the dirty-since-flush flag. */ spin_lock(&qlip->qli_lock); qlip->qli_dirty = false; spin_unlock(&qlip->qli_lock); xfs_trans_ail_copy_lsn(mp->m_ail, &qlip->qli_flush_lsn, &lip->li_lsn); /* * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. * * We also calculate the CRC here so that the on-disk dquot in the * buffer always has a valid CRC. This ensures there is no possibility * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_has_crc(mp)) { dqblk->dd_lsn = cpu_to_be64(lip->li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } /* * Attach the dquot to the buffer so that we can remove this dquot from * the AIL and release the flush lock once the dquot is synced to disk. */ bp->b_iodone = xfs_buf_dquot_iodone; list_add_tail(&lip->li_bio_list, &bp->b_li_list); /* * If the buffer is pinned then push on the log so we won't * get stuck waiting in the write for too long. */ if (xfs_buf_ispinned(bp)) { trace_xfs_dqflush_force(dqp); xfs_log_force(mp, 0); } trace_xfs_dqflush_done(dqp); return 0; out_abort: dqp->q_flags &= ~XFS_DQFLAG_DIRTY; xfs_trans_ail_delete(lip, 0); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); xfs_dqfunlock(dqp); return error; } /* * Lock two xfs_dquot structures. * * To avoid deadlocks we always lock the quota structure with * the lowerd id first. */ void xfs_dqlock2( struct xfs_dquot *d1, struct xfs_dquot *d2) { if (d1 && d2) { ASSERT(d1 != d2); if (d1->q_id > d2->q_id) { mutex_lock(&d2->q_qlock); mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); } else { mutex_lock(&d1->q_qlock); mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); } } else if (d1) { mutex_lock(&d1->q_qlock); } else if (d2) { mutex_lock(&d2->q_qlock); } } static int xfs_dqtrx_cmp( const void *a, const void *b) { const struct xfs_dqtrx *qa = a; const struct xfs_dqtrx *qb = b; if (qa->qt_dquot->q_id > qb->qt_dquot->q_id) return 1; if (qa->qt_dquot->q_id < qb->qt_dquot->q_id) return -1; return 0; } void xfs_dqlockn( struct xfs_dqtrx *q) { unsigned int i; BUILD_BUG_ON(XFS_QM_TRANS_MAXDQS > MAX_LOCKDEP_SUBCLASSES); /* Sort in order of dquot id, do not allow duplicates */ for (i = 0; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) { unsigned int j; for (j = 0; j < i; j++) ASSERT(q[i].qt_dquot != q[j].qt_dquot); } if (i == 0) return; sort(q, i, sizeof(struct xfs_dqtrx), xfs_dqtrx_cmp, NULL); mutex_lock(&q[0].qt_dquot->q_qlock); for (i = 1; i < XFS_QM_TRANS_MAXDQS && q[i].qt_dquot != NULL; i++) mutex_lock_nested(&q[i].qt_dquot->q_qlock, XFS_QLOCK_NESTED + i - 1); } int __init xfs_qm_init(void) { xfs_dquot_cache = kmem_cache_create("xfs_dquot", sizeof(struct xfs_dquot), 0, 0, NULL); if (!xfs_dquot_cache) goto out; xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx", sizeof(struct xfs_dquot_acct), 0, 0, NULL); if (!xfs_dqtrx_cache) goto out_free_dquot_cache; return 0; out_free_dquot_cache: kmem_cache_destroy(xfs_dquot_cache); out: return -ENOMEM; } void xfs_qm_exit(void) { kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); }
13 12 12 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 /* * linux/fs/nls/mac-greek.c * * Charset macgreek translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ /* * COPYRIGHT AND PERMISSION NOTICE * * Copyright 1991-2012 Unicode, Inc. All rights reserved. Distributed under * the Terms of Use in http://www.unicode.org/copyright.html. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of the Unicode data files and any associated documentation (the "Data * Files") or Unicode software and any associated documentation (the * "Software") to deal in the Data Files or Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, and/or sell copies of the Data Files or Software, and * to permit persons to whom the Data Files or Software are furnished to do * so, provided that (a) the above copyright notice(s) and this permission * notice appear with all copies of the Data Files or Software, (b) both the * above copyright notice(s) and this permission notice appear in associated * documentation, and (c) there is clear notice in each modified Data File or * in the Software as well as in the documentation associated with the Data * File(s) or Software that the data or software has been modified. * * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THE DATA FILES OR SOFTWARE. * * Except as contained in this notice, the name of a copyright holder shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in these Data Files or Software without prior written * authorization of the copyright holder. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00 */ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10 */ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20 */ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40 */ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60 */ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70 */ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80 */ 0x00c4, 0x00b9, 0x00b2, 0x00c9, 0x00b3, 0x00d6, 0x00dc, 0x0385, 0x00e0, 0x00e2, 0x00e4, 0x0384, 0x00a8, 0x00e7, 0x00e9, 0x00e8, /* 0x90 */ 0x00ea, 0x00eb, 0x00a3, 0x2122, 0x00ee, 0x00ef, 0x2022, 0x00bd, 0x2030, 0x00f4, 0x00f6, 0x00a6, 0x20ac, 0x00f9, 0x00fb, 0x00fc, /* 0xa0 */ 0x2020, 0x0393, 0x0394, 0x0398, 0x039b, 0x039e, 0x03a0, 0x00df, 0x00ae, 0x00a9, 0x03a3, 0x03aa, 0x00a7, 0x2260, 0x00b0, 0x00b7, /* 0xb0 */ 0x0391, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x0392, 0x0395, 0x0396, 0x0397, 0x0399, 0x039a, 0x039c, 0x03a6, 0x03ab, 0x03a8, 0x03a9, /* 0xc0 */ 0x03ac, 0x039d, 0x00ac, 0x039f, 0x03a1, 0x2248, 0x03a4, 0x00ab, 0x00bb, 0x2026, 0x00a0, 0x03a5, 0x03a7, 0x0386, 0x0388, 0x0153, /* 0xd0 */ 0x2013, 0x2015, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x0389, 0x038a, 0x038c, 0x038e, 0x03ad, 0x03ae, 0x03af, 0x03cc, 0x038f, /* 0xe0 */ 0x03cd, 0x03b1, 0x03b2, 0x03c8, 0x03b4, 0x03b5, 0x03c6, 0x03b3, 0x03b7, 0x03b9, 0x03be, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf, /* 0xf0 */ 0x03c0, 0x03ce, 0x03c1, 0x03c3, 0x03c4, 0x03b8, 0x03c9, 0x03c2, 0x03c7, 0x03c5, 0x03b6, 0x03ca, 0x03cb, 0x0390, 0x03b0, 0x00ad, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xca, 0x00, 0x00, 0x92, 0x00, 0xb4, 0x9b, 0xac, /* 0xa0-0xa7 */ 0x8c, 0xa9, 0x00, 0xc7, 0xc2, 0xff, 0xa8, 0x00, /* 0xa8-0xaf */ 0xae, 0xb1, 0x82, 0x84, 0x00, 0x00, 0x00, 0xaf, /* 0xb0-0xb7 */ 0x00, 0x81, 0x00, 0xc8, 0x00, 0x97, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x83, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x85, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0xa7, /* 0xd8-0xdf */ 0x88, 0x00, 0x89, 0x00, 0x8a, 0x00, 0x00, 0x8d, /* 0xe0-0xe7 */ 0x8f, 0x8e, 0x90, 0x91, 0x00, 0x00, 0x94, 0x95, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x99, 0x00, 0x9a, 0xd6, /* 0xf0-0xf7 */ 0x00, 0x9d, 0x00, 0x9e, 0x9f, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page03[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x8b, 0x87, 0xcd, 0x00, /* 0x80-0x87 */ 0xce, 0xd7, 0xd8, 0x00, 0xd9, 0x00, 0xda, 0xdf, /* 0x88-0x8f */ 0xfd, 0xb0, 0xb5, 0xa1, 0xa2, 0xb6, 0xb7, 0xb8, /* 0x90-0x97 */ 0xa3, 0xb9, 0xba, 0xa4, 0xbb, 0xc1, 0xa5, 0xc3, /* 0x98-0x9f */ 0xa6, 0xc4, 0x00, 0xaa, 0xc6, 0xcb, 0xbc, 0xcc, /* 0xa0-0xa7 */ 0xbe, 0xbf, 0xab, 0xbd, 0xc0, 0xdb, 0xdc, 0xdd, /* 0xa8-0xaf */ 0xfe, 0xe1, 0xe2, 0xe7, 0xe4, 0xe5, 0xfa, 0xe8, /* 0xb0-0xb7 */ 0xf5, 0xe9, 0xeb, 0xec, 0xed, 0xee, 0xea, 0xef, /* 0xb8-0xbf */ 0xf0, 0xf2, 0xf7, 0xf3, 0xf4, 0xf9, 0xe6, 0xf8, /* 0xc0-0xc7 */ 0xe3, 0xf6, 0xfb, 0xfc, 0xde, 0xe0, 0xf1, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0xd0, 0x00, 0xd1, 0x00, 0x00, /* 0x10-0x17 */ 0xd4, 0xd5, 0x00, 0x00, 0xd2, 0xd3, 0x00, 0x00, /* 0x18-0x1f */ 0xa0, 0x00, 0x96, 0x00, 0x00, 0x00, 0xc9, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page21[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xad, 0x00, 0x00, 0x00, 0xb2, 0xb3, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, page03, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, page21, page22, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x00-0x07 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x08-0x0f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x10-0x17 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x18-0x1f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x20-0x27 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x28-0x2f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x30-0x37 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x38-0x3f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x40-0x47 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x48-0x4f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x50-0x57 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x58-0x5f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x60-0x67 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x68-0x6f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x70-0x77 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x78-0x7f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x80-0x87 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x88-0x8f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x90-0x97 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x98-0x9f */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa0-0xa7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xa8-0xaf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb0-0xb7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xb8-0xbf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc0-0xc7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xc8-0xcf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd0-0xd7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xd8-0xdf */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe0-0xe7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xe8-0xef */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf0-0xf7 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "macgreek", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_macgreek(void) { return register_nls(&table); } static void __exit exit_nls_macgreek(void) { unregister_nls(&table); } module_init(init_nls_macgreek) module_exit(exit_nls_macgreek) MODULE_DESCRIPTION("NLS Codepage macgreek"); MODULE_LICENSE("Dual BSD/GPL");
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 // SPDX-License-Identifier: GPL-2.0 /* * BlueZ - Bluetooth protocol stack for Linux * * Copyright (C) 2021 Intel Corporation * Copyright 2023 NXP */ #include <linux/property.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "hci_codec.h" #include "hci_debugfs.h" #include "smp.h" #include "eir.h" #include "msft.h" #include "aosp.h" #include "leds.h" static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode, struct sk_buff *skb) { bt_dev_dbg(hdev, "result 0x%2.2x", result); if (hdev->req_status != HCI_REQ_PEND) return; hdev->req_result = result; hdev->req_status = HCI_REQ_DONE; /* Free the request command so it is not used as response */ kfree_skb(hdev->req_skb); hdev->req_skb = NULL; if (skb) { struct sock *sk = hci_skb_sk(skb); /* Drop sk reference if set */ if (sk) sock_put(sk); hdev->req_rsp = skb_get(skb); } wake_up_interruptible(&hdev->req_wait_q); } struct sk_buff *hci_cmd_sync_alloc(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, struct sock *sk) { int len = HCI_COMMAND_HDR_SIZE + plen; struct hci_command_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(len, GFP_ATOMIC); if (!skb) return NULL; hdr = skb_put(skb, HCI_COMMAND_HDR_SIZE); hdr->opcode = cpu_to_le16(opcode); hdr->plen = plen; if (plen) skb_put_data(skb, param, plen); bt_dev_dbg(hdev, "skb len %d", skb->len); hci_skb_pkt_type(skb) = HCI_COMMAND_PKT; hci_skb_opcode(skb) = opcode; /* Grab a reference if command needs to be associated with a sock (e.g. * likely mgmt socket that initiated the command). */ if (sk) { hci_skb_sk(skb) = sk; sock_hold(sk); } return skb; } static void hci_cmd_sync_add(struct hci_request *req, u16 opcode, u32 plen, const void *param, u8 event, struct sock *sk) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); /* If an error occurred during request building, there is no point in * queueing the HCI command. We can simply return. */ if (req->err) return; skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, sk); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); req->err = -ENOMEM; return; } if (skb_queue_empty(&req->cmd_q)) bt_cb(skb)->hci.req_flags |= HCI_REQ_START; hci_skb_event(skb) = event; skb_queue_tail(&req->cmd_q, skb); } static int hci_req_sync_run(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; unsigned long flags; bt_dev_dbg(hdev, "length %u", skb_queue_len(&req->cmd_q)); /* If an error occurred during request building, remove all HCI * commands queued on the HCI request queue. */ if (req->err) { skb_queue_purge(&req->cmd_q); return req->err; } /* Do not allow empty requests */ if (skb_queue_empty(&req->cmd_q)) return -ENODATA; skb = skb_peek_tail(&req->cmd_q); bt_cb(skb)->hci.req_complete_skb = hci_cmd_sync_complete; bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB; spin_lock_irqsave(&hdev->cmd_q.lock, flags); skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q); spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } static void hci_request_init(struct hci_request *req, struct hci_dev *hdev) { skb_queue_head_init(&req->cmd_q); req->hdev = hdev; req->err = 0; } /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk) { struct hci_request req; struct sk_buff *skb; int err = 0; bt_dev_dbg(hdev, "Opcode 0x%4.4x", opcode); hci_request_init(&req, hdev); hci_cmd_sync_add(&req, opcode, plen, param, event, sk); hdev->req_status = HCI_REQ_PEND; err = hci_req_sync_run(&req); if (err < 0) return ERR_PTR(err); err = wait_event_interruptible_timeout(hdev->req_wait_q, hdev->req_status != HCI_REQ_PEND, timeout); if (err == -ERESTARTSYS) return ERR_PTR(-EINTR); switch (hdev->req_status) { case HCI_REQ_DONE: err = -bt_to_errno(hdev->req_result); break; case HCI_REQ_CANCELED: err = -hdev->req_result; break; default: err = -ETIMEDOUT; break; } hdev->req_status = 0; hdev->req_result = 0; skb = hdev->req_rsp; hdev->req_rsp = NULL; bt_dev_dbg(hdev, "end: err %d", err); if (err < 0) { kfree_skb(skb); return ERR_PTR(err); } /* If command return a status event skb will be set to NULL as there are * no parameters. */ if (!skb) return ERR_PTR(-ENODATA); return skb; } EXPORT_SYMBOL(__hci_cmd_sync_sk); /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { return __hci_cmd_sync_sk(hdev, opcode, plen, param, 0, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync); /* Send HCI command and wait for command complete event */ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { struct sk_buff *skb; if (!test_bit(HCI_UP, &hdev->flags)) return ERR_PTR(-ENETDOWN); bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen); hci_req_sync_lock(hdev); skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout); hci_req_sync_unlock(hdev); return skb; } EXPORT_SYMBOL(hci_cmd_sync); /* This function requires the caller holds hdev->req_lock. */ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout) { return __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync_ev); /* This function requires the caller holds hdev->req_lock. */ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout, struct sock *sk) { struct sk_buff *skb; u8 status; skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk); /* If command return a status event, skb will be set to -ENODATA */ if (skb == ERR_PTR(-ENODATA)) return 0; if (IS_ERR(skb)) { if (!event) bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld", opcode, PTR_ERR(skb)); return PTR_ERR(skb); } status = skb->data[0]; kfree_skb(skb); return status; } EXPORT_SYMBOL(__hci_cmd_sync_status_sk); int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { return __hci_cmd_sync_status_sk(hdev, opcode, plen, param, 0, timeout, NULL); } EXPORT_SYMBOL(__hci_cmd_sync_status); int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u32 timeout) { int err; hci_req_sync_lock(hdev); err = __hci_cmd_sync_status(hdev, opcode, plen, param, timeout); hci_req_sync_unlock(hdev); return err; } EXPORT_SYMBOL(hci_cmd_sync_status); static void hci_cmd_sync_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work); bt_dev_dbg(hdev, ""); /* Dequeue all entries and run them */ while (1) { struct hci_cmd_sync_work_entry *entry; mutex_lock(&hdev->cmd_sync_work_lock); entry = list_first_entry_or_null(&hdev->cmd_sync_work_list, struct hci_cmd_sync_work_entry, list); if (entry) list_del(&entry->list); mutex_unlock(&hdev->cmd_sync_work_lock); if (!entry) break; bt_dev_dbg(hdev, "entry %p", entry); if (entry->func) { int err; hci_req_sync_lock(hdev); err = entry->func(hdev, entry->data); if (entry->destroy) entry->destroy(hdev, entry->data, err); hci_req_sync_unlock(hdev); } kfree(entry); } } static void hci_cmd_sync_cancel_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_cancel_work); cancel_delayed_work_sync(&hdev->cmd_timer); cancel_delayed_work_sync(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); wake_up_interruptible(&hdev->req_wait_q); } static int hci_scan_disable_sync(struct hci_dev *hdev); static int scan_disable_sync(struct hci_dev *hdev, void *data) { return hci_scan_disable_sync(hdev); } static int interleaved_inquiry_sync(struct hci_dev *hdev, void *data) { return hci_inquiry_sync(hdev, DISCOV_INTERLEAVED_INQUIRY_LEN, 0); } static void le_scan_disable(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, le_scan_disable.work); int status; bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) goto _return; status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL); if (status) { bt_dev_err(hdev, "failed to disable LE scan: %d", status); goto _return; } /* If we were running LE only scan, change discovery state. If * we were running both LE and BR/EDR inquiry simultaneously, * and BR/EDR inquiry is already finished, stop discovery, * otherwise BR/EDR inquiry will stop discovery when finished. * If we will resolve remote device name, do not change * discovery state. */ if (hdev->discovery.type == DISCOV_TYPE_LE) goto discov_stopped; if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) goto _return; if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { if (!test_bit(HCI_INQUIRY, &hdev->flags) && hdev->discovery.state != DISCOVERY_RESOLVING) goto discov_stopped; goto _return; } status = hci_cmd_sync_queue(hdev, interleaved_inquiry_sync, NULL, NULL); if (status) { bt_dev_err(hdev, "inquiry failed: status %d", status); goto discov_stopped; } goto _return; discov_stopped: hci_discovery_set_state(hdev, DISCOVERY_STOPPED); _return: hci_dev_unlock(hdev); } static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup); static int reenable_adv_sync(struct hci_dev *hdev, void *data) { bt_dev_dbg(hdev, ""); if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) return 0; if (hdev->cur_adv_instance) { return hci_schedule_adv_instance_sync(hdev, hdev->cur_adv_instance, true); } else { if (ext_adv_capable(hdev)) { hci_start_ext_adv_sync(hdev, 0x00); } else { hci_update_adv_data_sync(hdev, 0x00); hci_update_scan_rsp_data_sync(hdev, 0x00); hci_enable_advertising_sync(hdev); } } return 0; } static void reenable_adv(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, reenable_adv_work); int status; bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); status = hci_cmd_sync_queue(hdev, reenable_adv_sync, NULL, NULL); if (status) bt_dev_err(hdev, "failed to reenable ADV: %d", status); hci_dev_unlock(hdev); } static void cancel_adv_timeout(struct hci_dev *hdev) { if (hdev->adv_instance_timeout) { hdev->adv_instance_timeout = 0; cancel_delayed_work(&hdev->adv_instance_expire); } } /* For a single instance: * - force == true: The instance will be removed even when its remaining * lifetime is not zero. * - force == false: the instance will be deactivated but kept stored unless * the remaining lifetime is zero. * * For instance == 0x00: * - force == true: All instances will be removed regardless of their timeout * setting. * - force == false: Only instances that have a timeout will be removed. */ int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force) { struct adv_info *adv_instance, *n, *next_instance = NULL; int err; u8 rem_inst; /* Cancel any timeout concerning the removed instance(s). */ if (!instance || hdev->cur_adv_instance == instance) cancel_adv_timeout(hdev); /* Get the next instance to advertise BEFORE we remove * the current one. This can be the same instance again * if there is only one instance. */ if (instance && hdev->cur_adv_instance == instance) next_instance = hci_get_next_instance(hdev, instance); if (instance == 0x00) { list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { if (!(force || adv_instance->timeout)) continue; rem_inst = adv_instance->instance; err = hci_remove_adv_instance(hdev, rem_inst); if (!err) mgmt_advertising_removed(sk, hdev, rem_inst); } } else { adv_instance = hci_find_adv_instance(hdev, instance); if (force || (adv_instance && adv_instance->timeout && !adv_instance->remaining_time)) { /* Don't advertise a removed instance. */ if (next_instance && next_instance->instance == instance) next_instance = NULL; err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); } } if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return 0; if (next_instance && !ext_adv_capable(hdev)) return hci_schedule_adv_instance_sync(hdev, next_instance->instance, false); return 0; } static int adv_timeout_expire_sync(struct hci_dev *hdev, void *data) { u8 instance = *(u8 *)data; kfree(data); hci_clear_adv_instance_sync(hdev, NULL, instance, false); if (list_empty(&hdev->adv_instances)) return hci_disable_advertising_sync(hdev); return 0; } static void adv_timeout_expire(struct work_struct *work) { u8 *inst_ptr; struct hci_dev *hdev = container_of(work, struct hci_dev, adv_instance_expire.work); bt_dev_dbg(hdev, ""); hci_dev_lock(hdev); hdev->adv_instance_timeout = 0; if (hdev->cur_adv_instance == 0x00) goto unlock; inst_ptr = kmalloc(1, GFP_KERNEL); if (!inst_ptr) goto unlock; *inst_ptr = hdev->cur_adv_instance; hci_cmd_sync_queue(hdev, adv_timeout_expire_sync, inst_ptr, NULL); unlock: hci_dev_unlock(hdev); } static bool is_interleave_scanning(struct hci_dev *hdev) { return hdev->interleave_scan_state != INTERLEAVE_SCAN_NONE; } static int hci_passive_scan_sync(struct hci_dev *hdev); static void interleave_scan_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, interleave_scan.work); unsigned long timeout; if (hdev->interleave_scan_state == INTERLEAVE_SCAN_ALLOWLIST) { timeout = msecs_to_jiffies(hdev->advmon_allowlist_duration); } else if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NO_FILTER) { timeout = msecs_to_jiffies(hdev->advmon_no_filter_duration); } else { bt_dev_err(hdev, "unexpected error"); return; } hci_passive_scan_sync(hdev); hci_dev_lock(hdev); switch (hdev->interleave_scan_state) { case INTERLEAVE_SCAN_ALLOWLIST: bt_dev_dbg(hdev, "next state: allowlist"); hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; break; case INTERLEAVE_SCAN_NO_FILTER: bt_dev_dbg(hdev, "next state: no filter"); hdev->interleave_scan_state = INTERLEAVE_SCAN_ALLOWLIST; break; case INTERLEAVE_SCAN_NONE: bt_dev_err(hdev, "unexpected error"); } hci_dev_unlock(hdev); /* Don't continue interleaving if it was canceled */ if (is_interleave_scanning(hdev)) queue_delayed_work(hdev->req_workqueue, &hdev->interleave_scan, timeout); } void hci_cmd_sync_init(struct hci_dev *hdev) { INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work); INIT_LIST_HEAD(&hdev->cmd_sync_work_list); mutex_init(&hdev->cmd_sync_work_lock); mutex_init(&hdev->unregister_lock); INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work); INIT_WORK(&hdev->reenable_adv_work, reenable_adv); INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable); INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work); } static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev, struct hci_cmd_sync_work_entry *entry, int err) { if (entry->destroy) entry->destroy(hdev, entry->data, err); list_del(&entry->list); kfree(entry); } void hci_cmd_sync_clear(struct hci_dev *hdev) { struct hci_cmd_sync_work_entry *entry, *tmp; cancel_work_sync(&hdev->cmd_sync_work); cancel_work_sync(&hdev->reenable_adv_work); mutex_lock(&hdev->cmd_sync_work_lock); list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } void hci_cmd_sync_cancel(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { hdev->req_result = err; hdev->req_status = HCI_REQ_CANCELED; queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work); } } EXPORT_SYMBOL(hci_cmd_sync_cancel); /* Cancel ongoing command request synchronously: * * - Set result and mark status to HCI_REQ_CANCELED * - Wakeup command sync thread */ void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); if (hdev->req_status == HCI_REQ_PEND) { /* req_result is __u32 so error must be positive to be properly * propagated. */ hdev->req_result = err < 0 ? -err : err; hdev->req_status = HCI_REQ_CANCELED; wake_up_interruptible(&hdev->req_wait_q); } } EXPORT_SYMBOL(hci_cmd_sync_cancel_sync); /* Submit HCI command to be run in as cmd_sync_work: * * - hdev must _not_ be unregistered */ int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; int err = 0; mutex_lock(&hdev->unregister_lock); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { err = -ENODEV; goto unlock; } entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { err = -ENOMEM; goto unlock; } entry->func = func; entry->data = data; entry->destroy = destroy; mutex_lock(&hdev->cmd_sync_work_lock); list_add_tail(&entry->list, &hdev->cmd_sync_work_list); mutex_unlock(&hdev->cmd_sync_work_lock); queue_work(hdev->req_workqueue, &hdev->cmd_sync_work); unlock: mutex_unlock(&hdev->unregister_lock); return err; } EXPORT_SYMBOL(hci_cmd_sync_submit); /* Queue HCI command: * * - hdev must be running */ int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { /* Only queue command if hdev is running which means it had been opened * and is either on init phase or is already up. */ if (!test_bit(HCI_RUNNING, &hdev->flags)) return -ENETDOWN; return hci_cmd_sync_submit(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_queue); static struct hci_cmd_sync_work_entry * _hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { if (func && entry->func != func) continue; if (data && entry->data != data) continue; if (destroy && entry->destroy != destroy) continue; return entry; } return NULL; } /* Queue HCI command entry once: * * - Lookup if an entry already exist and only if it doesn't creates a new entry * and queue it. */ int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) return 0; return hci_cmd_sync_queue(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_queue_once); /* Run HCI command: * * - hdev must be running * - if on cmd_sync_work then run immediately otherwise queue */ int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { /* Only queue command if hdev is running which means it had been opened * and is either on init phase or is already up. */ if (!test_bit(HCI_RUNNING, &hdev->flags)) return -ENETDOWN; /* If on cmd_sync_work then run immediately otherwise queue */ if (current_work() == &hdev->cmd_sync_work) return func(hdev, data); return hci_cmd_sync_submit(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_run); /* Run HCI command entry once: * * - Lookup if an entry already exist and only if it doesn't creates a new entry * and run it. * - if on cmd_sync_work then run immediately otherwise queue */ int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) return 0; return hci_cmd_sync_run(hdev, func, data, destroy); } EXPORT_SYMBOL(hci_cmd_sync_run_once); /* Lookup HCI command entry: * * - Return first entry that matches by function callback or data or * destroy callback. */ struct hci_cmd_sync_work_entry * hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; mutex_lock(&hdev->cmd_sync_work_lock); entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); mutex_unlock(&hdev->cmd_sync_work_lock); return entry; } EXPORT_SYMBOL(hci_cmd_sync_lookup_entry); /* Cancel HCI command entry */ void hci_cmd_sync_cancel_entry(struct hci_dev *hdev, struct hci_cmd_sync_work_entry *entry) { mutex_lock(&hdev->cmd_sync_work_lock); _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); mutex_unlock(&hdev->cmd_sync_work_lock); } EXPORT_SYMBOL(hci_cmd_sync_cancel_entry); /* Dequeue one HCI command entry: * * - Lookup and cancel first entry that matches. */ bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); if (!entry) return false; hci_cmd_sync_cancel_entry(hdev, entry); return true; } EXPORT_SYMBOL(hci_cmd_sync_dequeue_once); /* Dequeue HCI command entry: * * - Lookup and cancel any entry that matches by function callback or data or * destroy callback. */ bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, void *data, hci_cmd_sync_work_destroy_t destroy) { struct hci_cmd_sync_work_entry *entry; bool ret = false; mutex_lock(&hdev->cmd_sync_work_lock); while ((entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy))) { _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); ret = true; } mutex_unlock(&hdev->cmd_sync_work_lock); return ret; } EXPORT_SYMBOL(hci_cmd_sync_dequeue); int hci_update_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; bt_dev_dbg(hdev, ""); if (!hdev_is_powered(hdev)) return 0; if (!lmp_ext_inq_capable(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return 0; memset(&cp, 0, sizeof(cp)); eir_create(hdev, cp.data); if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) return 0; memcpy(hdev->eir, cp.data, sizeof(cp.data)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static u8 get_service_classes(struct hci_dev *hdev) { struct bt_uuid *uuid; u8 val = 0; list_for_each_entry(uuid, &hdev->uuids, list) val |= uuid->svc_hint; return val; } int hci_update_class_sync(struct hci_dev *hdev) { u8 cod[3]; bt_dev_dbg(hdev, ""); if (!hdev_is_powered(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) return 0; cod[0] = hdev->minor_class; cod[1] = hdev->major_class; cod[2] = get_service_classes(hdev); if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) cod[1] |= 0x20; if (memcmp(cod, hdev->dev_class, 3) == 0) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod, HCI_CMD_TIMEOUT); } static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) { /* If there is no connection we are OK to advertise. */ if (hci_conn_num(hdev, LE_LINK) == 0) return true; /* Check le_states if there is any connection in peripheral role. */ if (hdev->conn_hash.le_num_peripheral > 0) { /* Peripheral connection state and non connectable mode * bit 20. */ if (!connectable && !(hdev->le_states[2] & 0x10)) return false; /* Peripheral connection state and connectable mode bit 38 * and scannable bit 21. */ if (connectable && (!(hdev->le_states[4] & 0x40) || !(hdev->le_states[2] & 0x20))) return false; } /* Check le_states if there is any connection in central role. */ if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) { /* Central connection state and non connectable mode bit 18. */ if (!connectable && !(hdev->le_states[2] & 0x02)) return false; /* Central connection state and connectable mode bit 35 and * scannable 19. */ if (connectable && (!(hdev->le_states[4] & 0x08) || !(hdev->le_states[2] & 0x08))) return false; } return true; } static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) { /* If privacy is not enabled don't use RPA */ if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) return false; /* If basic privacy mode is enabled use RPA */ if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) return true; /* If limited privacy mode is enabled don't use RPA if we're * both discoverable and bondable. */ if ((flags & MGMT_ADV_FLAG_DISCOV) && hci_dev_test_flag(hdev, HCI_BONDABLE)) return false; /* We're neither bondable nor discoverable in the limited * privacy mode, therefore use RPA. */ return true; } static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { /* If a random_addr has been set we're advertising or initiating an LE * connection we can't go ahead and change the random address at this * time. This is because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). * * In this kind of scenario skip the update and let the random * address be updated at the next cycle. */ if (bacmp(&hdev->random_addr, BDADDR_ANY) && (hci_dev_test_flag(hdev, HCI_LE_ADV) || hci_lookup_le_connect(hdev))) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa, HCI_CMD_TIMEOUT); } int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy, bool rpa, u8 *own_addr_type) { int err; /* If privacy is enabled use a resolvable private address. If * current RPA has expired or there is something else than * the current RPA in use, then generate a new one. */ if (rpa) { /* If Controller supports LL Privacy use own address type is * 0x03 */ if (ll_privacy_capable(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; /* Check if RPA is valid */ if (rpa_valid(hdev)) return 0; err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { bt_dev_err(hdev, "failed to generate new RPA"); return err; } err = hci_set_random_addr_sync(hdev, &hdev->rpa); if (err) return err; return 0; } /* In case of required privacy without resolvable private address, * use an non-resolvable private address. This is useful for active * scanning and non-connectable advertising. */ if (require_privacy) { bdaddr_t nrpa; while (true) { /* The non-resolvable private address is generated * from random six bytes with the two most significant * bits cleared. */ get_random_bytes(&nrpa, 6); nrpa.b[5] &= 0x3f; /* The non-resolvable private address shall not be * equal to the public address. */ if (bacmp(&hdev->bdaddr, &nrpa)) break; } *own_addr_type = ADDR_LE_DEV_RANDOM; return hci_set_random_addr_sync(hdev, &nrpa); } /* If forcing static address is in use or there is no public * address use the static address as random address (but skip * the HCI command if the current random address is already the * static one. * * In case BR/EDR has been disabled on a dual-mode controller * and a static address has been configured, then use that * address instead of the public BR/EDR address. */ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { *own_addr_type = ADDR_LE_DEV_RANDOM; if (bacmp(&hdev->static_addr, &hdev->random_addr)) return hci_set_random_addr_sync(hdev, &hdev->static_addr); return 0; } /* Neither privacy nor static address is being used so use a * public address. */ *own_addr_type = ADDR_LE_DEV_PUBLIC; return 0; } static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *set; u8 data[sizeof(*cp) + sizeof(*set) * 1]; u8 size; struct adv_info *adv = NULL; /* If request specifies an instance that doesn't exist, fail */ if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; /* If not enabled there is nothing to do */ if (!adv->enabled) return 0; } memset(data, 0, sizeof(data)); cp = (void *)data; set = (void *)cp->data; /* Instance 0x00 indicates all advertising instances will be disabled */ cp->num_of_sets = !!instance; cp->enable = 0x00; set->handle = adv ? adv->handle : instance; size = sizeof(*cp) + sizeof(*set) * cp->num_of_sets; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, size, data, HCI_CMD_TIMEOUT); } static int hci_set_adv_set_random_addr_sync(struct hci_dev *hdev, u8 instance, bdaddr_t *random_addr) { struct hci_cp_le_set_adv_set_rand_addr cp; int err; if (!instance) { /* Instance 0x00 doesn't have an adv_info, instead it uses * hdev->random_addr to track its address so whenever it needs * to be updated this also set the random address since * hdev->random_addr is shared with scan state machine. */ err = hci_set_random_addr_sync(hdev, random_addr); if (err) return err; } memset(&cp, 0, sizeof(cp)); cp.handle = instance; bacpy(&cp.bdaddr, random_addr); return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; bool connectable; u32 flags; bdaddr_t random_addr; u8 own_addr_type; int err; struct adv_info *adv; bool secondary_adv; if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; } else { adv = NULL; } /* Updating parameters of an active instance will return a * Command Disallowed error, so we must first disable the * instance if it is active. */ if (adv && !adv->pending) { err = hci_disable_ext_adv_instance_sync(hdev, instance); if (err) return err; } flags = hci_adv_instance_flags(hdev, instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. */ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || mgmt_get_connectable(hdev); if (!is_advertising_allowed(hdev, connectable)) return -EPERM; /* Set require_privacy to true only when non-connectable * advertising is used. In that case it is fine to use a * non-resolvable private address. */ err = hci_get_random_address(hdev, !connectable, adv_use_rpa(hdev, flags), adv, &own_addr_type, &random_addr); if (err < 0) return err; memset(&cp, 0, sizeof(cp)); if (adv) { hci_cpu_to_le24(adv->min_interval, cp.min_interval); hci_cpu_to_le24(adv->max_interval, cp.max_interval); cp.tx_power = adv->tx_power; } else { hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; } secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); if (connectable) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); } else if (hci_adv_instance_is_scannable(hdev, instance) || (flags & MGMT_ADV_PARAM_SCAN_RSP)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); } else { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); } /* If Own_Address_Type equals 0x02 or 0x03, the Peer_Address parameter * contains the peer’s Identity Address and the Peer_Address_Type * parameter contains the peer’s Identity Type (i.e., 0x00 or 0x01). * These parameters are used to locate the corresponding local IRK in * the resolving list; this IRK is used to generate their own address * used in the advertisement. */ if (own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) hci_copy_identity_address(hdev, &cp.peer_addr, &cp.peer_addr_type); cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.handle = adv ? adv->handle : instance; if (flags & MGMT_ADV_FLAG_SEC_2M) { cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_2M; } else if (flags & MGMT_ADV_FLAG_SEC_CODED) { cp.primary_phy = HCI_ADV_PHY_CODED; cp.secondary_phy = HCI_ADV_PHY_CODED; } else { /* In all other cases use 1M */ cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_1M; } err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) return err; if ((own_addr_type == ADDR_LE_DEV_RANDOM || own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) && bacmp(&random_addr, BDADDR_ANY)) { /* Check if random address need to be updated */ if (adv) { if (!bacmp(&random_addr, &adv->random_addr)) return 0; } else { if (!bacmp(&random_addr, &hdev->random_addr)) return 0; } return hci_set_adv_set_random_addr_sync(hdev, instance, &random_addr); } return 0; } static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_ext_scan_rsp_data, pdu, data, length, HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->scan_rsp_changed) return 0; } len = eir_create_scan_rsp(hdev, instance, pdu->data); pdu->handle = adv ? adv->handle : instance; pdu->length = len; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; if (adv) { adv->scan_rsp_changed = false; } else { memcpy(hdev->scan_rsp_data, pdu->data, len); hdev->scan_rsp_data_len = len; } return 0; } static int __hci_set_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_scan_rsp_data cp; u8 len; memset(&cp, 0, sizeof(cp)); len = eir_create_scan_rsp(hdev, instance, cp.data); if (hdev->scan_rsp_data_len == len && !memcmp(cp.data, hdev->scan_rsp_data, len)) return 0; memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); hdev->scan_rsp_data_len = len; cp.length = len; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) { if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; if (ext_adv_capable(hdev)) return hci_set_ext_scan_rsp_data_sync(hdev, instance); return __hci_set_scan_rsp_data_sync(hdev, instance); } int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_enable *cp; struct hci_cp_ext_adv_set *set; u8 data[sizeof(*cp) + sizeof(*set) * 1]; struct adv_info *adv; if (instance > 0) { adv = hci_find_adv_instance(hdev, instance); if (!adv) return -EINVAL; /* If already enabled there is nothing to do */ if (adv->enabled) return 0; } else { adv = NULL; } cp = (void *)data; set = (void *)cp->data; memset(cp, 0, sizeof(*cp)); cp->enable = 0x01; cp->num_of_sets = 0x01; memset(set, 0, sizeof(*set)); set->handle = adv ? adv->handle : instance; /* Set duration per instance since controller is responsible for * scheduling it. */ if (adv && adv->timeout) { u16 duration = adv->timeout * MSEC_PER_SEC; /* Time = N * 10 ms */ set->duration = cpu_to_le16(duration / 10); } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(*cp) + sizeof(*set) * cp->num_of_sets, data, HCI_CMD_TIMEOUT); } int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance) { int err; err = hci_setup_ext_adv_instance_sync(hdev, instance); if (err) return err; err = hci_set_ext_scan_rsp_data_sync(hdev, instance); if (err) return err; return hci_enable_ext_advertising_sync(hdev, instance); } int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; struct adv_info *adv = NULL; /* If periodic advertising already disabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->periodic || !adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x00; cp.handle = instance; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance, u16 min_interval, u16 max_interval) { struct hci_cp_le_set_per_adv_params cp; memset(&cp, 0, sizeof(cp)); if (!min_interval) min_interval = DISCOV_LE_PER_ADV_INT_MIN; if (!max_interval) max_interval = DISCOV_LE_PER_ADV_INT_MAX; cp.handle = instance; cp.min_interval = cpu_to_le16(min_interval); cp.max_interval = cpu_to_le16(max_interval); cp.periodic_properties = 0x0000; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length, HCI_MAX_PER_AD_LENGTH); u8 len; struct adv_info *adv = NULL; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->periodic) return 0; } len = eir_create_per_adv_data(hdev, instance, pdu->data); pdu->length = len; pdu->handle = adv ? adv->handle : instance; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); } static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_per_adv_enable cp; struct adv_info *adv = NULL; /* If periodic advertising already enabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); if (adv && adv->periodic && adv->enabled) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x01; cp.handle = instance; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Checks if periodic advertising data contains a Basic Announcement and if it * does generates a Broadcast ID and add Broadcast Announcement. */ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv) { u8 bid[3]; u8 ad[4 + 3]; /* Skip if NULL adv as instance 0x00 is used for general purpose * advertising so it cannot used for the likes of Broadcast Announcement * as it can be overwritten at any point. */ if (!adv) return 0; /* Check if PA data doesn't contains a Basic Audio Announcement then * there is nothing to do. */ if (!eir_get_service_data(adv->per_adv_data, adv->per_adv_data_len, 0x1851, NULL)) return 0; /* Check if advertising data already has a Broadcast Announcement since * the process may want to control the Broadcast ID directly and in that * case the kernel shall no interfere. */ if (eir_get_service_data(adv->adv_data, adv->adv_data_len, 0x1852, NULL)) return 0; /* Generate Broadcast ID */ get_random_bytes(bid, sizeof(bid)); eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid)); hci_set_adv_instance_data(hdev, adv->instance, sizeof(ad), ad, 0, NULL); return hci_update_adv_data_sync(hdev, adv->instance); } int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval) { struct adv_info *adv = NULL; int err; bool added = false; hci_disable_per_advertising_sync(hdev, instance); if (instance) { adv = hci_find_adv_instance(hdev, instance); /* Create an instance if that could not be found */ if (!adv) { adv = hci_add_per_instance(hdev, instance, flags, data_len, data, sync_interval, sync_interval); if (IS_ERR(adv)) return PTR_ERR(adv); adv->pending = false; added = true; } } /* Start advertising */ err = hci_start_ext_adv_sync(hdev, instance); if (err < 0) goto fail; err = hci_adv_bcast_annoucement(hdev, adv); if (err < 0) goto fail; err = hci_set_per_adv_params_sync(hdev, instance, min_interval, max_interval); if (err < 0) goto fail; err = hci_set_per_adv_data_sync(hdev, instance); if (err < 0) goto fail; err = hci_enable_per_advertising_sync(hdev, instance); if (err < 0) goto fail; return 0; fail: if (added) hci_remove_adv_instance(hdev, instance); return err; } static int hci_start_adv_sync(struct hci_dev *hdev, u8 instance) { int err; if (ext_adv_capable(hdev)) return hci_start_ext_adv_sync(hdev, instance); err = hci_update_adv_data_sync(hdev, instance); if (err) return err; err = hci_update_scan_rsp_data_sync(hdev, instance); if (err) return err; return hci_enable_advertising_sync(hdev); } int hci_enable_advertising_sync(struct hci_dev *hdev) { struct adv_info *adv_instance; struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; u16 adv_min_interval, adv_max_interval; u32 flags; u8 status; if (ext_adv_capable(hdev)) return hci_enable_ext_advertising_sync(hdev, hdev->cur_adv_instance); flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance); adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. */ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || mgmt_get_connectable(hdev); if (!is_advertising_allowed(hdev, connectable)) return -EINVAL; status = hci_disable_advertising_sync(hdev); if (status) return status; /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. */ hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to true only when non-connectable * advertising is used. In that case it is fine to use a * non-resolvable private address. */ status = hci_update_random_address_sync(hdev, !connectable, adv_use_rpa(hdev, flags), &own_addr_type); if (status) return status; memset(&cp, 0, sizeof(cp)); if (adv_instance) { adv_min_interval = adv_instance->min_interval; adv_max_interval = adv_instance->max_interval; } else { adv_min_interval = hdev->le_adv_min_interval; adv_max_interval = hdev->le_adv_max_interval; } if (connectable) { cp.type = LE_ADV_IND; } else { if (hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance)) cp.type = LE_ADV_SCAN_IND; else cp.type = LE_ADV_NONCONN_IND; if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) || hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN; adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX; } } cp.min_interval = cpu_to_le16(adv_min_interval); cp.max_interval = cpu_to_le16(adv_max_interval); cp.own_address_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (status) return status; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static int enable_advertising_sync(struct hci_dev *hdev, void *data) { return hci_enable_advertising_sync(hdev); } int hci_enable_advertising(struct hci_dev *hdev) { if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) return 0; return hci_cmd_sync_queue(hdev, enable_advertising_sync, NULL, NULL); } int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { int err; if (!ext_adv_capable(hdev)) return 0; err = hci_disable_ext_adv_instance_sync(hdev, instance); if (err) return err; /* If request specifies an instance that doesn't exist, fail */ if (instance > 0 && !hci_find_adv_instance(hdev, instance)) return -EINVAL; return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_REMOVE_ADV_SET, sizeof(instance), &instance, 0, HCI_CMD_TIMEOUT, sk); } int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason) { struct hci_cp_le_term_big cp; memset(&cp, 0, sizeof(cp)); cp.handle = handle; cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_LE_TERM_BIG, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) { DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length, HCI_MAX_EXT_AD_LENGTH); u8 len; struct adv_info *adv = NULL; int err; if (instance) { adv = hci_find_adv_instance(hdev, instance); if (!adv || !adv->adv_data_changed) return 0; } len = eir_create_adv_data(hdev, instance, pdu->data); pdu->length = len; pdu->handle = adv ? adv->handle : instance; pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, struct_size(pdu, data, len), pdu, HCI_CMD_TIMEOUT); if (err) return err; /* Update data if the command succeed */ if (adv) { adv->adv_data_changed = false; } else { memcpy(hdev->adv_data, pdu->data, len); hdev->adv_data_len = len; } return 0; } static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_adv_data cp; u8 len; memset(&cp, 0, sizeof(cp)); len = eir_create_adv_data(hdev, instance, cp.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && memcmp(cp.data, hdev->adv_data, len) == 0) return 0; memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); hdev->adv_data_len = len; cp.length = len; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance) { if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; if (ext_adv_capable(hdev)) return hci_set_ext_adv_data_sync(hdev, instance); return hci_set_adv_data_sync(hdev, instance); } int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance, bool force) { struct adv_info *adv = NULL; u16 timeout; if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && !ext_adv_capable(hdev)) return -EPERM; if (hdev->adv_instance_timeout) return -EBUSY; adv = hci_find_adv_instance(hdev, instance); if (!adv) return -ENOENT; /* A zero timeout means unlimited advertising. As long as there is * only one instance, duration should be ignored. We still set a timeout * in case further instances are being added later on. * * If the remaining lifetime of the instance is more than the duration * then the timeout corresponds to the duration, otherwise it will be * reduced to the remaining instance lifetime. */ if (adv->timeout == 0 || adv->duration <= adv->remaining_time) timeout = adv->duration; else timeout = adv->remaining_time; /* The remaining time is being reduced unless the instance is being * advertised without time limit. */ if (adv->timeout) adv->remaining_time = adv->remaining_time - timeout; /* Only use work for scheduling instances with legacy advertising */ if (!ext_adv_capable(hdev)) { hdev->adv_instance_timeout = timeout; queue_delayed_work(hdev->req_workqueue, &hdev->adv_instance_expire, secs_to_jiffies(timeout)); } /* If we're just re-scheduling the same instance again then do not * execute any HCI commands. This happens when a single instance is * being advertised. */ if (!force && hdev->cur_adv_instance == instance && hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; hdev->cur_adv_instance = instance; return hci_start_adv_sync(hdev, instance); } static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk) { int err; if (!ext_adv_capable(hdev)) return 0; /* Disable instance 0x00 to disable all instances */ err = hci_disable_ext_adv_instance_sync(hdev, 0x00); if (err) return err; return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force) { struct adv_info *adv, *n; int err = 0; if (ext_adv_capable(hdev)) /* Remove all existing sets */ err = hci_clear_adv_sets_sync(hdev, sk); if (ext_adv_capable(hdev)) return err; /* This is safe as long as there is no command send while the lock is * held. */ hci_dev_lock(hdev); /* Cleanup non-ext instances */ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) { u8 instance = adv->instance; int err; if (!(force || adv->timeout)) continue; err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); } hci_dev_unlock(hdev); return 0; } static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { int err = 0; /* If we use extended advertising, instance has to be removed first. */ if (ext_adv_capable(hdev)) err = hci_remove_ext_adv_instance_sync(hdev, instance, sk); if (ext_adv_capable(hdev)) return err; /* This is safe as long as there is no command send while the lock is * held. */ hci_dev_lock(hdev); err = hci_remove_adv_instance(hdev, instance); if (!err) mgmt_advertising_removed(sk, hdev, instance); hci_dev_unlock(hdev); return err; } /* For a single instance: * - force == true: The instance will be removed even when its remaining * lifetime is not zero. * - force == false: the instance will be deactivated but kept stored unless * the remaining lifetime is zero. * * For instance == 0x00: * - force == true: All instances will be removed regardless of their timeout * setting. * - force == false: Only instances that have a timeout will be removed. */ int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force) { struct adv_info *next = NULL; int err; /* Cancel any timeout concerning the removed instance(s). */ if (!instance || hdev->cur_adv_instance == instance) cancel_adv_timeout(hdev); /* Get the next instance to advertise BEFORE we remove * the current one. This can be the same instance again * if there is only one instance. */ if (hdev->cur_adv_instance == instance) next = hci_get_next_instance(hdev, instance); if (!instance) { err = hci_clear_adv_sync(hdev, sk, force); if (err) return err; } else { struct adv_info *adv = hci_find_adv_instance(hdev, instance); if (force || (adv && adv->timeout && !adv->remaining_time)) { /* Don't advertise a removed instance. */ if (next && next->instance == instance) next = NULL; err = hci_remove_adv_sync(hdev, instance, sk); if (err) return err; } } if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) return 0; if (next && !ext_adv_capable(hdev)) hci_schedule_adv_instance_sync(hdev, next->instance, false); return 0; } int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle) { struct hci_cp_read_rssi cp; cp.handle = handle; return __hci_cmd_sync_status(hdev, HCI_OP_READ_RSSI, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_read_clock_sync(struct hci_dev *hdev, struct hci_cp_read_clock *cp) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLOCK, sizeof(*cp), cp, HCI_CMD_TIMEOUT); } int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type) { struct hci_cp_read_tx_power cp; cp.handle = handle; cp.type = type; return __hci_cmd_sync_status(hdev, HCI_OP_READ_TX_POWER, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_disable_advertising_sync(struct hci_dev *hdev) { u8 enable = 0x00; int err = 0; /* If controller is not advertising we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; if (ext_adv_capable(hdev)) err = hci_disable_ext_adv_instance_sync(hdev, 0x00); if (ext_adv_capable(hdev)) return err; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static int hci_le_set_ext_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup) { struct hci_cp_le_set_ext_scan_enable cp; memset(&cp, 0, sizeof(cp)); cp.enable = val; if (hci_dev_test_flag(hdev, HCI_MESH)) cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; else cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, u8 filter_dup) { struct hci_cp_le_set_scan_enable cp; if (use_ext_scan(hdev)) return hci_le_set_ext_scan_enable_sync(hdev, val, filter_dup); memset(&cp, 0, sizeof(cp)); cp.enable = val; if (val && hci_dev_test_flag(hdev, HCI_MESH)) cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; else cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_set_addr_resolution_enable_sync(struct hci_dev *hdev, u8 val) { if (!ll_privacy_capable(hdev)) return 0; /* If controller is not/already resolving we are done. */ if (val == hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, sizeof(val), &val, HCI_CMD_TIMEOUT); } static int hci_scan_disable_sync(struct hci_dev *hdev) { int err; /* If controller is not scanning we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) return 0; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00); if (err) { bt_dev_err(hdev, "Unable to disable scanning: %d", err); return err; } return err; } static bool scan_use_rpa(struct hci_dev *hdev) { return hci_dev_test_flag(hdev, HCI_PRIVACY); } static void hci_start_interleave_scan(struct hci_dev *hdev) { hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; queue_delayed_work(hdev->req_workqueue, &hdev->interleave_scan, 0); } static void cancel_interleave_scan(struct hci_dev *hdev) { bt_dev_dbg(hdev, "cancelling interleave scan"); cancel_delayed_work_sync(&hdev->interleave_scan); hdev->interleave_scan_state = INTERLEAVE_SCAN_NONE; } /* Return true if interleave_scan wasn't started until exiting this function, * otherwise, return false */ static bool hci_update_interleaved_scan_sync(struct hci_dev *hdev) { /* Do interleaved scan only if all of the following are true: * - There is at least one ADV monitor * - At least one pending LE connection or one device to be scanned for * - Monitor offloading is not supported * If so, we should alternate between allowlist scan and one without * any filters to save power. */ bool use_interleaving = hci_is_adv_monitoring(hdev) && !(list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports)) && hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE; bool is_interleaving = is_interleave_scanning(hdev); if (use_interleaving && !is_interleaving) { hci_start_interleave_scan(hdev); bt_dev_dbg(hdev, "starting interleave scan"); return true; } if (!use_interleaving && is_interleaving) cancel_interleave_scan(hdev); return false; } /* Removes connection to resolve list if needed.*/ static int hci_le_del_resolve_list_sync(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct hci_cp_le_del_from_resolv_list cp; struct bdaddr_list_with_irk *entry; if (!ll_privacy_capable(hdev)) return 0; /* Check if the IRK has been programmed */ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, bdaddr, bdaddr_type); if (!entry) return 0; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); return __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_del_accept_list_sync(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct hci_cp_le_del_from_accept_list cp; int err; /* Check if device is on accept list before removing it */ if (!hci_bdaddr_list_lookup(&hdev->le_accept_list, bdaddr, bdaddr_type)) return 0; cp.bdaddr_type = bdaddr_type; bacpy(&cp.bdaddr, bdaddr); /* Ignore errors when removing from resolving list as that is likely * that the device was never added. */ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) { bt_dev_err(hdev, "Unable to remove from allow list: %d", err); return err; } bt_dev_dbg(hdev, "Remove %pMR (0x%x) from allow list", &cp.bdaddr, cp.bdaddr_type); return 0; } struct conn_params { bdaddr_t addr; u8 addr_type; hci_conn_flags_t flags; u8 privacy_mode; }; /* Adds connection to resolve list if needed. * Setting params to NULL programs local hdev->irk */ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, struct conn_params *params) { struct hci_cp_le_add_to_resolv_list cp; struct smp_irk *irk; struct bdaddr_list_with_irk *entry; struct hci_conn_params *p; if (!ll_privacy_capable(hdev)) return 0; /* Attempt to program local identity address, type and irk if params is * NULL. */ if (!params) { if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) return 0; hci_copy_identity_address(hdev, &cp.bdaddr, &cp.bdaddr_type); memcpy(cp.peer_irk, hdev->irk, 16); goto done; } else if (!(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION)) return 0; irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); if (!irk) return 0; /* Check if the IK has _not_ been programmed yet. */ entry = hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, &params->addr, params->addr_type); if (entry) return 0; cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, &params->addr); memcpy(cp.peer_irk, irk->val, 16); /* Default privacy mode is always Network */ params->privacy_mode = HCI_NETWORK_PRIVACY; rcu_read_lock(); p = hci_pend_le_action_lookup(&hdev->pend_le_conns, &params->addr, params->addr_type); if (!p) p = hci_pend_le_action_lookup(&hdev->pend_le_reports, &params->addr, params->addr_type); if (p) WRITE_ONCE(p->privacy_mode, HCI_NETWORK_PRIVACY); rcu_read_unlock(); done: if (hci_dev_test_flag(hdev, HCI_PRIVACY)) memcpy(cp.local_irk, hdev->irk, 16); else memset(cp.local_irk, 0, 16); return __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Set Device Privacy Mode. */ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, struct conn_params *params) { struct hci_cp_le_set_privacy_mode cp; struct smp_irk *irk; if (!ll_privacy_capable(hdev) || !(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION)) return 0; /* If device privacy mode has already been set there is nothing to do */ if (params->privacy_mode == HCI_DEVICE_PRIVACY) return 0; /* Check if HCI_CONN_FLAG_DEVICE_PRIVACY has been set as it also * indicates that LL Privacy has been enabled and * HCI_OP_LE_SET_PRIVACY_MODE is supported. */ if (!(params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)) return 0; irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); if (!irk) return 0; memset(&cp, 0, sizeof(cp)); cp.bdaddr_type = irk->addr_type; bacpy(&cp.bdaddr, &irk->bdaddr); cp.mode = HCI_DEVICE_PRIVACY; /* Note: params->privacy_mode is not updated since it is a copy */ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PRIVACY_MODE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Adds connection to allow list if needed, if the device uses RPA (has IRK) * this attempts to program the device in the resolving list as well and * properly set the privacy mode. */ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, struct conn_params *params, u8 *num_entries) { struct hci_cp_le_add_to_accept_list cp; int err; /* During suspend, only wakeable devices can be in acceptlist */ if (hdev->suspended && !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) { hci_le_del_accept_list_sync(hdev, &params->addr, params->addr_type); return 0; } /* Select filter policy to accept all advertising */ if (*num_entries >= hdev->le_accept_list_size) return -ENOSPC; /* Attempt to program the device in the resolving list first to avoid * having to rollback in case it fails since the resolving list is * dynamic it can probably be smaller than the accept list. */ err = hci_le_add_resolve_list_sync(hdev, params); if (err) { bt_dev_err(hdev, "Unable to add to resolve list: %d", err); return err; } /* Set Privacy Mode */ err = hci_le_set_privacy_mode_sync(hdev, params); if (err) { bt_dev_err(hdev, "Unable to set privacy mode: %d", err); return err; } /* Check if already in accept list */ if (hci_bdaddr_list_lookup(&hdev->le_accept_list, &params->addr, params->addr_type)) return 0; *num_entries += 1; cp.bdaddr_type = params->addr_type; bacpy(&cp.bdaddr, &params->addr); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) { bt_dev_err(hdev, "Unable to add to allow list: %d", err); /* Rollback the device from the resolving list */ hci_le_del_resolve_list_sync(hdev, &cp.bdaddr, cp.bdaddr_type); return err; } bt_dev_dbg(hdev, "Add %pMR (0x%x) to allow list", &cp.bdaddr, cp.bdaddr_type); return 0; } /* This function disables/pause all advertising instances */ static int hci_pause_advertising_sync(struct hci_dev *hdev) { int err; int old_state; /* If already been paused there is nothing to do. */ if (hdev->advertising_paused) return 0; bt_dev_dbg(hdev, "Pausing directed advertising"); /* Stop directed advertising */ old_state = hci_dev_test_flag(hdev, HCI_ADVERTISING); if (old_state) { /* When discoverable timeout triggers, then just make sure * the limited discoverable flag is cleared. Even in the case * of a timeout triggered from general discoverable, it is * safe to unconditionally clear the flag. */ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hdev->discov_timeout = 0; } bt_dev_dbg(hdev, "Pausing advertising instances"); /* Call to disable any advertisements active on the controller. * This will succeed even if no advertisements are configured. */ err = hci_disable_advertising_sync(hdev); if (err) return err; /* If we are using software rotation, pause the loop */ if (!ext_adv_capable(hdev)) cancel_adv_timeout(hdev); hdev->advertising_paused = true; hdev->advertising_old_state = old_state; return 0; } /* This function enables all user advertising instances */ static int hci_resume_advertising_sync(struct hci_dev *hdev) { struct adv_info *adv, *tmp; int err; /* If advertising has not been paused there is nothing to do. */ if (!hdev->advertising_paused) return 0; /* Resume directed advertising */ hdev->advertising_paused = false; if (hdev->advertising_old_state) { hci_dev_set_flag(hdev, HCI_ADVERTISING); hdev->advertising_old_state = 0; } bt_dev_dbg(hdev, "Resuming advertising instances"); if (ext_adv_capable(hdev)) { /* Call for each tracked instance to be re-enabled */ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) { err = hci_enable_ext_advertising_sync(hdev, adv->instance); if (!err) continue; /* If the instance cannot be resumed remove it */ hci_remove_ext_adv_instance_sync(hdev, adv->instance, NULL); } } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop */ err = hci_schedule_adv_instance_sync(hdev, hdev->cur_adv_instance, true); } hdev->advertising_paused = false; return err; } static int hci_pause_addr_resolution(struct hci_dev *hdev) { int err; if (!ll_privacy_capable(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) return 0; /* Cannot disable addr resolution if scanning is enabled or * when initiating an LE connection. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) || hci_lookup_le_connect(hdev)) { bt_dev_err(hdev, "Command not allowed when scan/LE connect"); return -EPERM; } /* Cannot disable addr resolution if advertising is enabled. */ err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "Pause advertising failed: %d", err); return err; } err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00); if (err) bt_dev_err(hdev, "Unable to disable Address Resolution: %d", err); /* Return if address resolution is disabled and RPA is not used. */ if (!err && scan_use_rpa(hdev)) return 0; hci_resume_advertising_sync(hdev); return err; } struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, bool extended, struct sock *sk) { u16 opcode = extended ? HCI_OP_READ_LOCAL_OOB_EXT_DATA : HCI_OP_READ_LOCAL_OOB_DATA; return __hci_cmd_sync_sk(hdev, opcode, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } static struct conn_params *conn_params_copy(struct list_head *list, size_t *n) { struct hci_conn_params *params; struct conn_params *p; size_t i; rcu_read_lock(); i = 0; list_for_each_entry_rcu(params, list, action) ++i; *n = i; rcu_read_unlock(); p = kvcalloc(*n, sizeof(struct conn_params), GFP_KERNEL); if (!p) return NULL; rcu_read_lock(); i = 0; list_for_each_entry_rcu(params, list, action) { /* Racing adds are handled in next scan update */ if (i >= *n) break; /* No hdev->lock, but: addr, addr_type are immutable. * privacy_mode is only written by us or in * hci_cc_le_set_privacy_mode that we wait for. * We should be idempotent so MGMT updating flags * while we are processing is OK. */ bacpy(&p[i].addr, &params->addr); p[i].addr_type = params->addr_type; p[i].flags = READ_ONCE(params->flags); p[i].privacy_mode = READ_ONCE(params->privacy_mode); ++i; } rcu_read_unlock(); *n = i; return p; } /* Clear LE Accept List */ static int hci_le_clear_accept_list_sync(struct hci_dev *hdev) { if (!(hdev->commands[26] & 0x80)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL, HCI_CMD_TIMEOUT); } /* Device must not be scanning when updating the accept list. * * Update is done using the following sequence: * * ll_privacy_capable((Disable Advertising) -> Disable Resolving List) -> * Remove Devices From Accept List -> * (has IRK && ll_privacy_capable(Remove Devices From Resolving List))-> * Add Devices to Accept List -> * (has IRK && ll_privacy_capable(Remove Devices From Resolving List)) -> * ll_privacy_capable(Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * * In case of failure advertising shall be restored to its original state and * return would disable accept list since either accept or resolving list could * not be programmed. * */ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) { struct conn_params *params; struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; u8 filter_policy; size_t i, n; int err; /* Pause advertising if resolving list can be used as controllers * cannot accept resolving list modifications while advertising. */ if (ll_privacy_capable(hdev)) { err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "pause advertising failed: %d", err); return 0x00; } } /* Disable address resolution while reprogramming accept list since * devices that do have an IRK will be programmed in the resolving list * when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00); if (err) { bt_dev_err(hdev, "Unable to disable LL privacy: %d", err); goto done; } /* Force address filtering if PA Sync is in progress */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { struct hci_cp_le_pa_create_sync *sent; sent = hci_sent_cmd_data(hdev, HCI_OP_LE_PA_CREATE_SYNC); if (sent) { struct conn_params pa; memset(&pa, 0, sizeof(pa)); bacpy(&pa.addr, &sent->addr); pa.addr_type = sent->addr_type; /* Clear first since there could be addresses left * behind. */ hci_le_clear_accept_list_sync(hdev); num_entries = 1; err = hci_le_add_accept_list_sync(hdev, &pa, &num_entries); goto done; } } /* Go through the current accept list programmed into the * controller one by one and check if that address is connected or is * still in the list of pending connections or list of devices to * report. If not present in either list, then remove it from * the controller. */ list_for_each_entry_safe(b, t, &hdev->le_accept_list, list) { if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type)) continue; /* Pointers not dereferenced, no locks needed */ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); pend_report = hci_pend_le_action_lookup(&hdev->pend_le_reports, &b->bdaddr, b->bdaddr_type); /* If the device is not likely to connect or report, * remove it from the acceptlist. */ if (!pend_conn && !pend_report) { hci_le_del_accept_list_sync(hdev, &b->bdaddr, b->bdaddr_type); continue; } num_entries++; } /* Since all no longer valid accept list entries have been * removed, walk through the list of pending connections * and ensure that any new device gets programmed into * the controller. * * If the list of the devices is larger than the list of * available accept list entries in the controller, then * just abort and return filer policy value to not use the * accept list. * * The list and params may be mutated while we wait for events, * so make a copy and iterate it. */ params = conn_params_copy(&hdev->pend_le_conns, &n); if (!params) { err = -ENOMEM; goto done; } for (i = 0; i < n; ++i) { err = hci_le_add_accept_list_sync(hdev, &params[i], &num_entries); if (err) { kvfree(params); goto done; } } kvfree(params); /* After adding all new pending connections, walk through * the list of pending reports and also add these to the * accept list if there is still space. Abort if space runs out. */ params = conn_params_copy(&hdev->pend_le_reports, &n); if (!params) { err = -ENOMEM; goto done; } for (i = 0; i < n; ++i) { err = hci_le_add_accept_list_sync(hdev, &params[i], &num_entries); if (err) { kvfree(params); goto done; } } kvfree(params); /* Use the allowlist unless the following conditions are all true: * - We are not currently suspending * - There are 1 or more ADV monitors registered and it's not offloaded * - Interleaved scanning is not currently using the allowlist */ if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended && hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE && hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST) err = -EINVAL; done: filter_policy = err ? 0x00 : 0x01; /* Enable address resolution when LL Privacy is enabled. */ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01); if (err) bt_dev_err(hdev, "Unable to enable LL privacy: %d", err); /* Resume advertising if it was paused */ if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ return filter_policy; } static void hci_le_scan_phy_params(struct hci_cp_le_scan_phy_params *cp, u8 type, u16 interval, u16 window) { cp->type = type; cp->interval = cpu_to_le16(interval); cp->window = cpu_to_le16(window); } static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { struct hci_cp_le_set_ext_scan_params *cp; struct hci_cp_le_scan_phy_params *phy; u8 data[sizeof(*cp) + sizeof(*phy) * 2]; u8 num_phy = 0x00; cp = (void *)data; phy = (void *)cp->data; memset(data, 0, sizeof(data)); cp->own_addr_type = own_addr_type; cp->filter_policy = filter_policy; /* Check if PA Sync is in progress then select the PHY based on the * hci_conn.iso_qos. */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { struct hci_cp_le_add_to_accept_list *sent; sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST); if (sent) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, ISO_LINK, &sent->bdaddr); if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; if (qos->bcast.in.phy & BT_ISO_PHY_1M || qos->bcast.in.phy & BT_ISO_PHY_2M) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, window); num_phy++; phy++; } if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } if (num_phy) goto done; } } } if (scan_1m(hdev) || scan_2m(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, window); num_phy++; phy++; } if (scan_coded(hdev)) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, window * 3); num_phy++; phy++; } done: if (!num_phy) return -EINVAL; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS, sizeof(*cp) + sizeof(*phy) * num_phy, data, HCI_CMD_TIMEOUT); } static int hci_le_set_scan_param_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy) { struct hci_cp_le_set_scan_param cp; if (use_ext_scan(hdev)) return hci_le_set_ext_scan_param_sync(hdev, type, interval, window, own_addr_type, filter_policy); memset(&cp, 0, sizeof(cp)); cp.type = type; cp.interval = cpu_to_le16(interval); cp.window = cpu_to_le16(window); cp.own_address_type = own_addr_type; cp.filter_policy = filter_policy; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_start_scan_sync(struct hci_dev *hdev, u8 type, u16 interval, u16 window, u8 own_addr_type, u8 filter_policy, u8 filter_dup) { int err; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_le_set_scan_param_sync(hdev, type, interval, window, own_addr_type, filter_policy); if (err) return err; return hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE, filter_dup); } static int hci_passive_scan_sync(struct hci_dev *hdev) { u8 own_addr_type; u8 filter_policy; u16 window, interval; u8 filter_dups = LE_SCAN_FILTER_DUP_ENABLE; int err; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); return 0; } err = hci_scan_disable_sync(hdev); if (err) { bt_dev_err(hdev, "disable scanning failed: %d", err); return err; } /* Set require_privacy to false since no SCAN_REQ are send * during passive scanning. Not using an non-resolvable address * here is important so that peer devices using direct * advertising with our address will be correctly reported * by the controller. */ if (hci_update_random_address_sync(hdev, false, scan_use_rpa(hdev), &own_addr_type)) return 0; if (hdev->enable_advmon_interleave_scan && hci_update_interleaved_scan_sync(hdev)) return 0; bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state); /* Adding or removing entries from the accept list must * happen before enabling scanning. The controller does * not allow accept list modification while scanning. */ filter_policy = hci_update_accept_list_sync(hdev); /* If suspended and filter_policy set to 0x00 (no acceptlist) then * passive scanning cannot be started since that would require the host * to be woken up to process the reports. */ if (hdev->suspended && !filter_policy) { /* Check if accept list is empty then there is no need to scan * while suspended. */ if (list_empty(&hdev->le_accept_list)) return 0; /* If there are devices is the accept_list that means some * devices could not be programmed which in non-suspended case * means filter_policy needs to be set to 0x00 so the host needs * to filter, but since this is treating suspended case we * can ignore device needing host to filter to allow devices in * the acceptlist to be able to wakeup the system. */ filter_policy = 0x01; } /* When the controller is using random resolvable addresses and * with that having LE privacy enabled, then controllers with * Extended Scanner Filter Policies support can now enable support * for handling directed advertising. * * So instead of using filter polices 0x00 (no acceptlist) * and 0x01 (acceptlist enabled) use the new filter policies * 0x02 (no acceptlist) and 0x03 (acceptlist enabled). */ if (hci_dev_test_flag(hdev, HCI_PRIVACY) && (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)) filter_policy |= 0x02; if (hdev->suspended) { window = hdev->le_scan_window_suspend; interval = hdev->le_scan_int_suspend; } else if (hci_is_le_conn_scanning(hdev)) { window = hdev->le_scan_window_connect; interval = hdev->le_scan_int_connect; } else if (hci_is_adv_monitoring(hdev)) { window = hdev->le_scan_window_adv_monitor; interval = hdev->le_scan_int_adv_monitor; /* Disable duplicates filter when scanning for advertisement * monitor for the following reasons. * * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm * controllers ignore RSSI_Sampling_Period when the duplicates * filter is enabled. * * For SW pattern filtering, when we're not doing interleaved * scanning, it is necessary to disable duplicates filter, * otherwise hosts can only receive one advertisement and it's * impossible to know if a peer is still in range. */ filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } else { window = hdev->le_scan_window; interval = hdev->le_scan_interval; } /* Disable all filtering for Mesh */ if (hci_dev_test_flag(hdev, HCI_MESH)) { filter_policy = 0; filter_dups = LE_SCAN_FILTER_DUP_DISABLE; } bt_dev_dbg(hdev, "LE passive scan with acceptlist = %d", filter_policy); return hci_start_scan_sync(hdev, LE_SCAN_PASSIVE, interval, window, own_addr_type, filter_policy, filter_dups); } /* This function controls the passive scanning based on hdev->pend_le_conns * list. If there are pending LE connection we start the background scanning, * otherwise we stop it in the following sequence: * * If there are devices to scan: * * Disable Scanning -> Update Accept List -> * ll_privacy_capable((Disable Advertising) -> Disable Resolving List -> * Update Resolving List -> Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * * Otherwise: * * Disable Scanning */ int hci_update_passive_scan_sync(struct hci_dev *hdev) { int err; if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_AUTO_OFF) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* No point in doing scanning if LE support hasn't been enabled */ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; /* If discovery is active don't interfere with it */ if (hdev->discovery.state != DISCOVERY_STOPPED) return 0; /* Reset RSSI and UUID filters when starting background scanning * since these filters are meant for service discovery only. * * The Start Discovery and Start Service Discovery operations * ensure to set proper values for RSSI threshold and UUID * filter list. So it is safe to just reset them here. */ hci_discovery_filter_clear(hdev); bt_dev_dbg(hdev, "ADV monitoring is %s", hci_is_adv_monitoring(hdev) ? "on" : "off"); if (!hci_dev_test_flag(hdev, HCI_MESH) && list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports) && !hci_is_adv_monitoring(hdev) && !hci_dev_test_flag(hdev, HCI_PA_SYNC)) { /* If there is no pending LE connections or devices * to be scanned for or no ADV monitors, we should stop the * background scanning. */ bt_dev_dbg(hdev, "stopping background scanning"); err = hci_scan_disable_sync(hdev); if (err) bt_dev_err(hdev, "stop background scanning failed: %d", err); } else { /* If there is at least one pending LE connection, we should * keep the background scan running. */ /* If controller is connecting, we should not start scanning * since some controllers are not able to scan and connect at * the same time. */ if (hci_lookup_le_connect(hdev)) return 0; bt_dev_dbg(hdev, "start background scanning"); err = hci_passive_scan_sync(hdev); if (err) bt_dev_err(hdev, "start background scanning failed: %d", err); } return err; } static int update_scan_sync(struct hci_dev *hdev, void *data) { return hci_update_scan_sync(hdev); } int hci_update_scan(struct hci_dev *hdev) { return hci_cmd_sync_queue(hdev, update_scan_sync, NULL, NULL); } static int update_passive_scan_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); } int hci_update_passive_scan(struct hci_dev *hdev) { /* Only queue if it would have any effect */ if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || hci_dev_test_flag(hdev, HCI_SETUP) || hci_dev_test_flag(hdev, HCI_CONFIG) || hci_dev_test_flag(hdev, HCI_AUTO_OFF) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL, NULL); } int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val) { int err; if (!bredr_sc_enabled(hdev) || lmp_host_sc_capable(hdev)) return 0; err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT, sizeof(val), &val, HCI_CMD_TIMEOUT); if (!err) { if (val) { hdev->features[1][0] |= LMP_HOST_SC; hci_dev_set_flag(hdev, HCI_SC_ENABLED); } else { hdev->features[1][0] &= ~LMP_HOST_SC; hci_dev_clear_flag(hdev, HCI_SC_ENABLED); } } return err; } int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode) { int err; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) || lmp_host_ssp_capable(hdev)) return 0; if (!mode && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); if (err) return err; return hci_write_sc_support_sync(hdev, 0x01); } int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul) { struct hci_cp_write_le_host_supported cp; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) || !lmp_bredr_capable(hdev)) return 0; /* Check first if we already have the right host state * (host features set) */ if (le == lmp_host_le_capable(hdev) && simul == lmp_host_le_br_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); cp.le = le; cp.simul = simul; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_powered_update_adv_sync(struct hci_dev *hdev) { struct adv_info *adv, *tmp; int err; if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) return 0; /* If RPA Resolution has not been enable yet it means the * resolving list is empty and we should attempt to program the * local IRK in order to support using own_addr_type * ADDR_LE_DEV_RANDOM_RESOLVED (0x03). */ if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { hci_le_add_resolve_list_sync(hdev, NULL); hci_le_set_addr_resolution_enable_sync(hdev, 0x01); } /* Make sure the controller has a good default for * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) { err = hci_setup_ext_adv_instance_sync(hdev, 0x00); if (!err) hci_update_scan_rsp_data_sync(hdev, 0x00); } else { err = hci_update_adv_data_sync(hdev, 0x00); if (!err) hci_update_scan_rsp_data_sync(hdev, 0x00); } if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) hci_enable_advertising_sync(hdev); } /* Call for each tracked instance to be scheduled */ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) hci_schedule_adv_instance_sync(hdev, adv->instance, true); return 0; } static int hci_write_auth_enable_sync(struct hci_dev *hdev) { u8 link_sec; link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY); if (link_sec == test_bit(HCI_AUTH, &hdev->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE, sizeof(link_sec), &link_sec, HCI_CMD_TIMEOUT); } int hci_write_fast_connectable_sync(struct hci_dev *hdev, bool enable) { struct hci_cp_write_page_scan_activity cp; u8 type; int err = 0; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; memset(&cp, 0, sizeof(cp)); if (enable) { type = PAGE_SCAN_TYPE_INTERLACED; /* 160 msec page scan interval */ cp.interval = cpu_to_le16(0x0100); } else { type = hdev->def_page_scan_type; cp.interval = cpu_to_le16(hdev->def_page_scan_int); } cp.window = cpu_to_le16(hdev->def_page_scan_window); if (__cpu_to_le16(hdev->page_scan_interval) != cp.interval || __cpu_to_le16(hdev->page_scan_window) != cp.window) { err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) return err; } if (hdev->page_scan_type != type) err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE, sizeof(type), &type, HCI_CMD_TIMEOUT); return err; } static bool disconnected_accept_list_entries(struct hci_dev *hdev) { struct bdaddr_list *b; list_for_each_entry(b, &hdev->accept_list, list) { struct hci_conn *conn; conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); if (!conn) return true; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) return true; } return false; } static int hci_write_scan_enable_sync(struct hci_dev *hdev, u8 val) { return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE, sizeof(val), &val, HCI_CMD_TIMEOUT); } int hci_update_scan_sync(struct hci_dev *hdev) { u8 scan; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (!hdev_is_powered(hdev)) return 0; if (mgmt_powering_down(hdev)) return 0; if (hdev->scanning_paused) return 0; if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || disconnected_accept_list_entries(hdev)) scan = SCAN_PAGE; else scan = SCAN_DISABLED; if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) scan |= SCAN_INQUIRY; if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) && test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY)) return 0; return hci_write_scan_enable_sync(hdev, scan); } int hci_update_name_sync(struct hci_dev *hdev) { struct hci_cp_write_local_name cp; memset(&cp, 0, sizeof(cp)); memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* This function perform powered update HCI command sequence after the HCI init * sequence which end up resetting all states, the sequence is as follows: * * HCI_SSP_ENABLED(Enable SSP) * HCI_LE_ENABLED(Enable LE) * HCI_LE_ENABLED(ll_privacy_capable(Add local IRK to Resolving List) -> * Update adv data) * Enable Authentication * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class -> * Set Name -> Set EIR) * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address) */ int hci_powered_update_sync(struct hci_dev *hdev) { int err; /* Register the available SMP channels (BR/EDR and LE) only when * successfully powering on the controller. This late * registration is required so that LE SMP can clearly decide if * the public address or static address is used. */ smp_register(hdev); err = hci_write_ssp_mode_sync(hdev, 0x01); if (err) return err; err = hci_write_le_host_supported_sync(hdev, 0x01, 0x00); if (err) return err; err = hci_powered_update_adv_sync(hdev); if (err) return err; err = hci_write_auth_enable_sync(hdev); if (err) return err; if (lmp_bredr_capable(hdev)) { if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) hci_write_fast_connectable_sync(hdev, true); else hci_write_fast_connectable_sync(hdev, false); hci_update_scan_sync(hdev); hci_update_class_sync(hdev); hci_update_name_sync(hdev); hci_update_eir_sync(hdev); } /* If forcing static address is in use or there is no public * address use the static address as random address (but skip * the HCI command if the current random address is already the * static one. * * In case BR/EDR has been disabled on a dual-mode controller * and a static address has been configured, then use that * address instead of the public BR/EDR address. */ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) { if (bacmp(&hdev->static_addr, BDADDR_ANY)) return hci_set_random_addr_sync(hdev, &hdev->static_addr); } return 0; } /** * hci_dev_get_bd_addr_from_property - Get the Bluetooth Device Address * (BD_ADDR) for a HCI device from * a firmware node property. * @hdev: The HCI device * * Search the firmware node for 'local-bd-address'. * * All-zero BD addresses are rejected, because those could be properties * that exist in the firmware tables, but were not updated by the firmware. For * example, the DTS could define 'local-bd-address', with zero BD addresses. */ static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) { struct fwnode_handle *fwnode = dev_fwnode(hdev->dev.parent); bdaddr_t ba; int ret; ret = fwnode_property_read_u8_array(fwnode, "local-bd-address", (u8 *)&ba, sizeof(ba)); if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) baswap(&hdev->public_addr, &ba); else bacpy(&hdev->public_addr, &ba); } struct hci_init_stage { int (*func)(struct hci_dev *hdev); }; /* Run init stage NULL terminated function table */ static int hci_init_stage_sync(struct hci_dev *hdev, const struct hci_init_stage *stage) { size_t i; for (i = 0; stage[i].func; i++) { int err; err = stage[i].func(hdev); if (err) return err; } return 0; } /* Read Local Version */ static int hci_read_local_version_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_VERSION, 0, NULL, HCI_CMD_TIMEOUT); } /* Read BD Address */ static int hci_read_bd_addr_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_BD_ADDR, 0, NULL, HCI_CMD_TIMEOUT); } #define HCI_INIT(_func) \ { \ .func = _func, \ } static const struct hci_init_stage hci_init0[] = { /* HCI_OP_READ_LOCAL_VERSION */ HCI_INIT(hci_read_local_version_sync), /* HCI_OP_READ_BD_ADDR */ HCI_INIT(hci_read_bd_addr_sync), {} }; int hci_reset_sync(struct hci_dev *hdev) { int err; set_bit(HCI_RESET, &hdev->flags); err = __hci_cmd_sync_status(hdev, HCI_OP_RESET, 0, NULL, HCI_CMD_TIMEOUT); if (err) return err; return 0; } static int hci_init0_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); /* Reset */ if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { err = hci_reset_sync(hdev); if (err) return err; } return hci_init_stage_sync(hdev, hci_init0); } static int hci_unconf_init_sync(struct hci_dev *hdev) { int err; if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return 0; err = hci_init0_sync(hdev); if (err < 0) return err; if (hci_dev_test_flag(hdev, HCI_SETUP)) hci_debugfs_create_basic(hdev); return 0; } /* Read Local Supported Features. */ static int hci_read_local_features_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_FEATURES, 0, NULL, HCI_CMD_TIMEOUT); } /* BR Controller init stage 1 command sequence */ static const struct hci_init_stage br_init1[] = { /* HCI_OP_READ_LOCAL_FEATURES */ HCI_INIT(hci_read_local_features_sync), /* HCI_OP_READ_LOCAL_VERSION */ HCI_INIT(hci_read_local_version_sync), /* HCI_OP_READ_BD_ADDR */ HCI_INIT(hci_read_bd_addr_sync), {} }; /* Read Local Commands */ static int hci_read_local_cmds_sync(struct hci_dev *hdev) { /* All Bluetooth 1.2 and later controllers should support the * HCI command for reading the local supported commands. * * Unfortunately some controllers indicate Bluetooth 1.2 support, * but do not have support for this command. If that is the case, * the driver can quirk the behavior and skip reading the local * supported commands. */ if (hdev->hci_ver > BLUETOOTH_VER_1_1 && !test_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks)) return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL, HCI_CMD_TIMEOUT); return 0; } static int hci_init1_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); /* Reset */ if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { err = hci_reset_sync(hdev); if (err) return err; } return hci_init_stage_sync(hdev, br_init1); } /* Read Buffer Size (ACL mtu, max pkt, etc.) */ static int hci_read_buffer_size_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Class of Device */ static int hci_read_dev_class_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLASS_OF_DEV, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Local Name */ static int hci_read_local_name_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_NAME, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Voice Setting */ static int hci_read_voice_setting_sync(struct hci_dev *hdev) { if (!read_voice_setting_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Number of Supported IAC */ static int hci_read_num_supported_iac_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_NUM_SUPPORTED_IAC, 0, NULL, HCI_CMD_TIMEOUT); } /* Read Current IAC LAP */ static int hci_read_current_iac_lap_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_READ_CURRENT_IAC_LAP, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type, u8 cond_type, bdaddr_t *bdaddr, u8 auto_accept) { struct hci_cp_set_event_filter cp; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) return 0; memset(&cp, 0, sizeof(cp)); cp.flt_type = flt_type; if (flt_type != HCI_FLT_CLEAR_ALL) { cp.cond_type = cond_type; bacpy(&cp.addr_conn_flt.bdaddr, bdaddr); cp.addr_conn_flt.auto_accept = auto_accept; } return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_FLT, flt_type == HCI_FLT_CLEAR_ALL ? sizeof(cp.flt_type) : sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_clear_event_filter_sync(struct hci_dev *hdev) { if (!hci_dev_test_flag(hdev, HCI_EVENT_FILTER_CONFIGURED)) return 0; /* In theory the state machine should not reach here unless * a hci_set_event_filter_sync() call succeeds, but we do * the check both for parity and as a future reminder. */ if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) return 0; return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00, BDADDR_ANY, 0x00); } /* Connection accept timeout ~20 secs */ static int hci_write_ca_timeout_sync(struct hci_dev *hdev) { __le16 param = cpu_to_le16(0x7d00); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CA_TIMEOUT, sizeof(param), &param, HCI_CMD_TIMEOUT); } /* Enable SCO flow control if supported */ static int hci_write_sync_flowctl_sync(struct hci_dev *hdev) { struct hci_cp_write_sync_flowctl cp; int err; /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */ if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) || !test_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks)) return 0; memset(&cp, 0, sizeof(cp)); cp.enable = 0x01; err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SYNC_FLOWCTL, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (!err) hci_dev_set_flag(hdev, HCI_SCO_FLOWCTL); return err; } /* BR Controller init stage 2 command sequence */ static const struct hci_init_stage br_init2[] = { /* HCI_OP_READ_BUFFER_SIZE */ HCI_INIT(hci_read_buffer_size_sync), /* HCI_OP_READ_CLASS_OF_DEV */ HCI_INIT(hci_read_dev_class_sync), /* HCI_OP_READ_LOCAL_NAME */ HCI_INIT(hci_read_local_name_sync), /* HCI_OP_READ_VOICE_SETTING */ HCI_INIT(hci_read_voice_setting_sync), /* HCI_OP_READ_NUM_SUPPORTED_IAC */ HCI_INIT(hci_read_num_supported_iac_sync), /* HCI_OP_READ_CURRENT_IAC_LAP */ HCI_INIT(hci_read_current_iac_lap_sync), /* HCI_OP_SET_EVENT_FLT */ HCI_INIT(hci_clear_event_filter_sync), /* HCI_OP_WRITE_CA_TIMEOUT */ HCI_INIT(hci_write_ca_timeout_sync), /* HCI_OP_WRITE_SYNC_FLOWCTL */ HCI_INIT(hci_write_sync_flowctl_sync), {} }; static int hci_write_ssp_mode_1_sync(struct hci_dev *hdev) { u8 mode = 0x01; if (!lmp_ssp_capable(hdev) || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; /* When SSP is available, then the host features page * should also be available as well. However some * controllers list the max_page as 0 as long as SSP * has not been enabled. To achieve proper debugging * output, force the minimum max_page to 1 at least. */ hdev->max_page = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } static int hci_write_eir_sync(struct hci_dev *hdev) { struct hci_cp_write_eir cp; if (!lmp_ssp_capable(hdev) || hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) return 0; memset(hdev->eir, 0, sizeof(hdev->eir)); memset(&cp, 0, sizeof(cp)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_write_inquiry_mode_sync(struct hci_dev *hdev) { u8 mode; if (!lmp_inq_rssi_capable(hdev) && !test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) return 0; /* If Extended Inquiry Result events are supported, then * they are clearly preferred over Inquiry Result with RSSI * events. */ mode = lmp_ext_inq_capable(hdev) ? 0x02 : 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_INQUIRY_MODE, sizeof(mode), &mode, HCI_CMD_TIMEOUT); } static int hci_read_inq_rsp_tx_power_sync(struct hci_dev *hdev) { if (!lmp_inq_tx_pwr_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_local_ext_features_sync(struct hci_dev *hdev, u8 page) { struct hci_cp_read_local_ext_features cp; if (!lmp_ext_feat_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); cp.page = page; return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_read_local_ext_features_1_sync(struct hci_dev *hdev) { return hci_read_local_ext_features_sync(hdev, 0x01); } /* HCI Controller init stage 2 command sequence */ static const struct hci_init_stage hci_init2[] = { /* HCI_OP_READ_LOCAL_COMMANDS */ HCI_INIT(hci_read_local_cmds_sync), /* HCI_OP_WRITE_SSP_MODE */ HCI_INIT(hci_write_ssp_mode_1_sync), /* HCI_OP_WRITE_EIR */ HCI_INIT(hci_write_eir_sync), /* HCI_OP_WRITE_INQUIRY_MODE */ HCI_INIT(hci_write_inquiry_mode_sync), /* HCI_OP_READ_INQ_RSP_TX_POWER */ HCI_INIT(hci_read_inq_rsp_tx_power_sync), /* HCI_OP_READ_LOCAL_EXT_FEATURES */ HCI_INIT(hci_read_local_ext_features_1_sync), /* HCI_OP_WRITE_AUTH_ENABLE */ HCI_INIT(hci_write_auth_enable_sync), {} }; /* Read LE Buffer Size */ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) { /* Use Read LE Buffer Size V2 if supported */ if (iso_capable(hdev) && hdev->commands[41] & 0x20) return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE_V2, 0, NULL, HCI_CMD_TIMEOUT); return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Local Supported Features */ static int hci_le_read_local_features_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Supported States */ static int hci_le_read_supported_states_sync(struct hci_dev *hdev) { return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL, HCI_CMD_TIMEOUT); } /* LE Controller init stage 2 command sequence */ static const struct hci_init_stage le_init2[] = { /* HCI_OP_LE_READ_LOCAL_FEATURES */ HCI_INIT(hci_le_read_local_features_sync), /* HCI_OP_LE_READ_BUFFER_SIZE */ HCI_INIT(hci_le_read_buffer_size_sync), /* HCI_OP_LE_READ_SUPPORTED_STATES */ HCI_INIT(hci_le_read_supported_states_sync), {} }; static int hci_init2_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init2); if (err) return err; if (lmp_bredr_capable(hdev)) { err = hci_init_stage_sync(hdev, br_init2); if (err) return err; } else { hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); } if (lmp_le_capable(hdev)) { err = hci_init_stage_sync(hdev, le_init2); if (err) return err; /* LE-only controllers have LE implicitly enabled */ if (!lmp_bredr_capable(hdev)) hci_dev_set_flag(hdev, HCI_LE_ENABLED); } return 0; } static int hci_set_event_mask_sync(struct hci_dev *hdev) { /* The second byte is 0xff instead of 0x9f (two reserved bits * disabled) since a Broadcom 1.2 dongle doesn't respond to the * command otherwise. */ u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 }; /* CSR 1.1 dongles does not accept any bitfield so don't try to set * any event mask for pre 1.2 devices. */ if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; if (lmp_bredr_capable(hdev)) { events[4] |= 0x01; /* Flow Specification Complete */ /* Don't set Disconnect Complete and mode change when * suspended as that would wakeup the host when disconnecting * due to suspend. */ if (hdev->suspended) { events[0] &= 0xef; events[2] &= 0xf7; } } else { /* Use a different default for LE-only devices */ memset(events, 0, sizeof(events)); events[1] |= 0x20; /* Command Complete */ events[1] |= 0x40; /* Command Status */ events[1] |= 0x80; /* Hardware Error */ /* If the controller supports the Disconnect command, enable * the corresponding event. In addition enable packet flow * control related events. */ if (hdev->commands[0] & 0x20) { /* Don't set Disconnect Complete when suspended as that * would wakeup the host when disconnecting due to * suspend. */ if (!hdev->suspended) events[0] |= 0x10; /* Disconnection Complete */ events[2] |= 0x04; /* Number of Completed Packets */ events[3] |= 0x02; /* Data Buffer Overflow */ } /* If the controller supports the Read Remote Version * Information command, enable the corresponding event. */ if (hdev->commands[2] & 0x80) events[1] |= 0x08; /* Read Remote Version Information * Complete */ if (hdev->le_features[0] & HCI_LE_ENCRYPTION) { events[0] |= 0x80; /* Encryption Change */ events[5] |= 0x80; /* Encryption Key Refresh Complete */ } } if (lmp_inq_rssi_capable(hdev) || test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) events[4] |= 0x02; /* Inquiry Result with RSSI */ if (lmp_ext_feat_capable(hdev)) events[4] |= 0x04; /* Read Remote Extended Features Complete */ if (lmp_esco_capable(hdev)) { events[5] |= 0x08; /* Synchronous Connection Complete */ events[5] |= 0x10; /* Synchronous Connection Changed */ } if (lmp_sniffsubr_capable(hdev)) events[5] |= 0x20; /* Sniff Subrating */ if (lmp_pause_enc_capable(hdev)) events[5] |= 0x80; /* Encryption Key Refresh Complete */ if (lmp_ext_inq_capable(hdev)) events[5] |= 0x40; /* Extended Inquiry Result */ if (lmp_no_flush_capable(hdev)) events[7] |= 0x01; /* Enhanced Flush Complete */ if (lmp_lsto_capable(hdev)) events[6] |= 0x80; /* Link Supervision Timeout Changed */ if (lmp_ssp_capable(hdev)) { events[6] |= 0x01; /* IO Capability Request */ events[6] |= 0x02; /* IO Capability Response */ events[6] |= 0x04; /* User Confirmation Request */ events[6] |= 0x08; /* User Passkey Request */ events[6] |= 0x10; /* Remote OOB Data Request */ events[6] |= 0x20; /* Simple Pairing Complete */ events[7] |= 0x04; /* User Passkey Notification */ events[7] |= 0x08; /* Keypress Notification */ events[7] |= 0x10; /* Remote Host Supported * Features Notification */ } if (lmp_le_capable(hdev)) events[7] |= 0x20; /* LE Meta-Event */ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } static int hci_read_stored_link_key_sync(struct hci_dev *hdev) { struct hci_cp_read_stored_link_key cp; if (!(hdev->commands[6] & 0x20) || test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) return 0; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, BDADDR_ANY); cp.read_all = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_READ_STORED_LINK_KEY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_setup_link_policy_sync(struct hci_dev *hdev) { struct hci_cp_write_def_link_policy cp; u16 link_policy = 0; if (!(hdev->commands[5] & 0x10)) return 0; memset(&cp, 0, sizeof(cp)); if (lmp_rswitch_capable(hdev)) link_policy |= HCI_LP_RSWITCH; if (lmp_hold_capable(hdev)) link_policy |= HCI_LP_HOLD; if (lmp_sniff_capable(hdev)) link_policy |= HCI_LP_SNIFF; if (lmp_park_capable(hdev)) link_policy |= HCI_LP_PARK; cp.policy = cpu_to_le16(link_policy); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_read_page_scan_activity_sync(struct hci_dev *hdev) { if (!(hdev->commands[8] & 0x01)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev) { if (!(hdev->commands[18] & 0x04) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_read_page_scan_type_sync(struct hci_dev *hdev) { /* Some older Broadcom based Bluetooth 1.2 controllers do not * support the Read Page Scan Type command. Check support for * this command in the bit mask of supported commands. */ if (!(hdev->commands[13] & 0x01) || test_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read features beyond page 1 if available */ static int hci_read_local_ext_features_all_sync(struct hci_dev *hdev) { u8 page; int err; if (!lmp_ext_feat_capable(hdev)) return 0; for (page = 2; page < HCI_MAX_PAGES && page <= hdev->max_page; page++) { err = hci_read_local_ext_features_sync(hdev, page); if (err) return err; } return 0; } /* HCI Controller init stage 3 command sequence */ static const struct hci_init_stage hci_init3[] = { /* HCI_OP_SET_EVENT_MASK */ HCI_INIT(hci_set_event_mask_sync), /* HCI_OP_READ_STORED_LINK_KEY */ HCI_INIT(hci_read_stored_link_key_sync), /* HCI_OP_WRITE_DEF_LINK_POLICY */ HCI_INIT(hci_setup_link_policy_sync), /* HCI_OP_READ_PAGE_SCAN_ACTIVITY */ HCI_INIT(hci_read_page_scan_activity_sync), /* HCI_OP_READ_DEF_ERR_DATA_REPORTING */ HCI_INIT(hci_read_def_err_data_reporting_sync), /* HCI_OP_READ_PAGE_SCAN_TYPE */ HCI_INIT(hci_read_page_scan_type_sync), /* HCI_OP_READ_LOCAL_EXT_FEATURES */ HCI_INIT(hci_read_local_ext_features_all_sync), {} }; static int hci_le_set_event_mask_sync(struct hci_dev *hdev) { u8 events[8]; if (!lmp_le_capable(hdev)) return 0; memset(events, 0, sizeof(events)); if (hdev->le_features[0] & HCI_LE_ENCRYPTION) events[0] |= 0x10; /* LE Long Term Key Request */ /* If controller supports the Connection Parameters Request * Link Layer Procedure, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_CONN_PARAM_REQ_PROC) /* LE Remote Connection Parameter Request */ events[0] |= 0x20; /* If the controller supports the Data Length Extension * feature, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ /* If the controller supports LL Privacy feature or LE Extended Adv, * enable the corresponding event. */ if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ /* Mark Device Privacy if Privacy Mode is supported */ if (privacy_mode_capable(hdev)) hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY; /* Mark Address Resolution if LL Privacy is supported */ if (ll_privacy_capable(hdev)) hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION; /* If the controller supports Extended Scanner Filter * Policies, enable the corresponding event. */ if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY) events[1] |= 0x04; /* LE Direct Advertising Report */ /* If the controller supports Channel Selection Algorithm #2 * feature, enable the corresponding event. */ if (hdev->le_features[1] & HCI_LE_CHAN_SEL_ALG2) events[2] |= 0x08; /* LE Channel Selection Algorithm */ /* If the controller supports the LE Set Scan Enable command, * enable the corresponding advertising report event. */ if (hdev->commands[26] & 0x08) events[0] |= 0x02; /* LE Advertising Report */ /* If the controller supports the LE Create Connection * command, enable the corresponding event. */ if (hdev->commands[26] & 0x10) events[0] |= 0x01; /* LE Connection Complete */ /* If the controller supports the LE Connection Update * command, enable the corresponding event. */ if (hdev->commands[27] & 0x04) events[0] |= 0x04; /* LE Connection Update Complete */ /* If the controller supports the LE Read Remote Used Features * command, enable the corresponding event. */ if (hdev->commands[27] & 0x20) /* LE Read Remote Used Features Complete */ events[0] |= 0x08; /* If the controller supports the LE Read Local P-256 * Public Key command, enable the corresponding event. */ if (hdev->commands[34] & 0x02) /* LE Read Local P-256 Public Key Complete */ events[0] |= 0x80; /* If the controller supports the LE Generate DHKey * command, enable the corresponding event. */ if (hdev->commands[34] & 0x04) events[1] |= 0x01; /* LE Generate DHKey Complete */ /* If the controller supports the LE Set Default PHY or * LE Set PHY commands, enable the corresponding event. */ if (hdev->commands[35] & (0x20 | 0x40)) events[1] |= 0x08; /* LE PHY Update Complete */ /* If the controller supports LE Set Extended Scan Parameters * and LE Set Extended Scan Enable commands, enable the * corresponding event. */ if (use_ext_scan(hdev)) events[1] |= 0x10; /* LE Extended Advertising Report */ /* If the controller supports the LE Extended Advertising * command, enable the corresponding event. */ if (ext_adv_capable(hdev)) events[2] |= 0x02; /* LE Advertising Set Terminated */ if (cis_capable(hdev)) { events[3] |= 0x01; /* LE CIS Established */ if (cis_peripheral_capable(hdev)) events[3] |= 0x02; /* LE CIS Request */ } if (bis_capable(hdev)) { events[1] |= 0x20; /* LE PA Report */ events[1] |= 0x40; /* LE PA Sync Established */ events[3] |= 0x04; /* LE Create BIG Complete */ events[3] |= 0x08; /* LE Terminate BIG Complete */ events[3] |= 0x10; /* LE BIG Sync Established */ events[3] |= 0x20; /* LE BIG Sync Loss */ events[4] |= 0x02; /* LE BIG Info Advertising Report */ } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } /* Read LE Advertising Channel TX Power */ static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev) { if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) { /* HCI TS spec forbids mixing of legacy and extended * advertising commands wherein READ_ADV_TX_POWER is * also included. So do not call it if extended adv * is supported otherwise controller will return * COMMAND_DISALLOWED for extended commands. */ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL, HCI_CMD_TIMEOUT); } return 0; } /* Read LE Min/Max Tx Power*/ static int hci_le_read_tx_power_sync(struct hci_dev *hdev) { if (!(hdev->commands[38] & 0x80) || test_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Accept List Size */ static int hci_le_read_accept_list_size_sync(struct hci_dev *hdev) { if (!(hdev->commands[26] & 0x40)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ACCEPT_LIST_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Resolving List Size */ static int hci_le_read_resolv_list_size_sync(struct hci_dev *hdev) { if (!(hdev->commands[34] & 0x40)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_RESOLV_LIST_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } /* Clear LE Resolving List */ static int hci_le_clear_resolv_list_sync(struct hci_dev *hdev) { if (!(hdev->commands[34] & 0x20)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL, HCI_CMD_TIMEOUT); } /* Set RPA timeout */ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev) { __le16 timeout = cpu_to_le16(hdev->rpa_timeout); if (!(hdev->commands[35] & 0x04) || test_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT, sizeof(timeout), &timeout, HCI_CMD_TIMEOUT); } /* Read LE Maximum Data Length */ static int hci_le_read_max_data_len_sync(struct hci_dev *hdev) { if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Suggested Default Data Length */ static int hci_le_read_def_data_len_sync(struct hci_dev *hdev) { if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL, HCI_CMD_TIMEOUT); } /* Read LE Number of Supported Advertising Sets */ static int hci_le_read_num_support_adv_sets_sync(struct hci_dev *hdev) { if (!ext_adv_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS, 0, NULL, HCI_CMD_TIMEOUT); } /* Write LE Host Supported */ static int hci_set_le_support_sync(struct hci_dev *hdev) { struct hci_cp_write_le_host_supported cp; /* LE-only devices do not support explicit enablement */ if (!lmp_bredr_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { cp.le = 0x01; cp.simul = 0x00; } if (cp.le == lmp_host_le_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* LE Set Host Feature */ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; if (!cis_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; cp.bit_value = 1; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* LE Controller init stage 3 command sequence */ static const struct hci_init_stage le_init3[] = { /* HCI_OP_LE_SET_EVENT_MASK */ HCI_INIT(hci_le_set_event_mask_sync), /* HCI_OP_LE_READ_ADV_TX_POWER */ HCI_INIT(hci_le_read_adv_tx_power_sync), /* HCI_OP_LE_READ_TRANSMIT_POWER */ HCI_INIT(hci_le_read_tx_power_sync), /* HCI_OP_LE_READ_ACCEPT_LIST_SIZE */ HCI_INIT(hci_le_read_accept_list_size_sync), /* HCI_OP_LE_CLEAR_ACCEPT_LIST */ HCI_INIT(hci_le_clear_accept_list_sync), /* HCI_OP_LE_READ_RESOLV_LIST_SIZE */ HCI_INIT(hci_le_read_resolv_list_size_sync), /* HCI_OP_LE_CLEAR_RESOLV_LIST */ HCI_INIT(hci_le_clear_resolv_list_sync), /* HCI_OP_LE_SET_RPA_TIMEOUT */ HCI_INIT(hci_le_set_rpa_timeout_sync), /* HCI_OP_LE_READ_MAX_DATA_LEN */ HCI_INIT(hci_le_read_max_data_len_sync), /* HCI_OP_LE_READ_DEF_DATA_LEN */ HCI_INIT(hci_le_read_def_data_len_sync), /* HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS */ HCI_INIT(hci_le_read_num_support_adv_sets_sync), /* HCI_OP_WRITE_LE_HOST_SUPPORTED */ HCI_INIT(hci_set_le_support_sync), /* HCI_OP_LE_SET_HOST_FEATURE */ HCI_INIT(hci_le_set_host_feature_sync), {} }; static int hci_init3_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init3); if (err) return err; if (lmp_le_capable(hdev)) return hci_init_stage_sync(hdev, le_init3); return 0; } static int hci_delete_stored_link_key_sync(struct hci_dev *hdev) { struct hci_cp_delete_stored_link_key cp; /* Some Broadcom based Bluetooth controllers do not support the * Delete Stored Link Key command. They are clearly indicating its * absence in the bit mask of supported commands. * * Check the supported commands and only if the command is marked * as supported send it. If not supported assume that the controller * does not have actual support for stored link keys which makes this * command redundant anyway. * * Some controllers indicate that they support handling deleting * stored link keys, but they don't. The quirk lets a driver * just disable this command. */ if (!(hdev->commands[6] & 0x80) || test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) return 0; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, BDADDR_ANY); cp.delete_all = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_DELETE_STORED_LINK_KEY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev) { u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; bool changed = false; /* Set event mask page 2 if the HCI command for it is supported */ if (!(hdev->commands[22] & 0x04)) return 0; /* If Connectionless Peripheral Broadcast central role is supported * enable all necessary events for it. */ if (lmp_cpb_central_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ events[2] |= 0x08; /* Truncated Page Complete */ events[2] |= 0x20; /* CPB Channel Map Change */ changed = true; } /* If Connectionless Peripheral Broadcast peripheral role is supported * enable all necessary events for it. */ if (lmp_cpb_peripheral_capable(hdev)) { events[2] |= 0x01; /* Synchronization Train Received */ events[2] |= 0x02; /* CPB Receive */ events[2] |= 0x04; /* CPB Timeout */ events[2] |= 0x10; /* Peripheral Page Response Timeout */ changed = true; } /* Enable Authenticated Payload Timeout Expired event if supported */ if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING) { events[2] |= 0x80; changed = true; } /* Some Broadcom based controllers indicate support for Set Event * Mask Page 2 command, but then actually do not support it. Since * the default value is all bits set to zero, the command is only * required if the event mask has to be changed. In case no change * to the event mask is needed, skip this command. */ if (!changed) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events, HCI_CMD_TIMEOUT); } /* Read local codec list if the HCI command is supported */ static int hci_read_local_codecs_sync(struct hci_dev *hdev) { if (hdev->commands[45] & 0x04) hci_read_supported_codecs_v2(hdev); else if (hdev->commands[29] & 0x20) hci_read_supported_codecs(hdev); return 0; } /* Read local pairing options if the HCI command is supported */ static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev) { if (!(hdev->commands[41] & 0x08)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL, HCI_CMD_TIMEOUT); } /* Get MWS transport configuration if the HCI command is supported */ static int hci_get_mws_transport_config_sync(struct hci_dev *hdev) { if (!mws_transport_config_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL, HCI_CMD_TIMEOUT); } /* Check for Synchronization Train support */ static int hci_read_sync_train_params_sync(struct hci_dev *hdev) { if (!lmp_sync_train_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL, HCI_CMD_TIMEOUT); } /* Enable Secure Connections if supported and configured */ static int hci_write_sc_support_1_sync(struct hci_dev *hdev) { u8 support = 0x01; if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) || !bredr_sc_enabled(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT, sizeof(support), &support, HCI_CMD_TIMEOUT); } /* Set erroneous data reporting if supported to the wideband speech * setting value */ static int hci_set_err_data_report_sync(struct hci_dev *hdev) { struct hci_cp_write_def_err_data_reporting cp; bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); if (!(hdev->commands[18] & 0x08) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; if (enabled == hdev->err_data_reporting) return 0; memset(&cp, 0, sizeof(cp)); cp.err_data_reporting = enabled ? ERR_DATA_REPORTING_ENABLED : ERR_DATA_REPORTING_DISABLED; return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static const struct hci_init_stage hci_init4[] = { /* HCI_OP_DELETE_STORED_LINK_KEY */ HCI_INIT(hci_delete_stored_link_key_sync), /* HCI_OP_SET_EVENT_MASK_PAGE_2 */ HCI_INIT(hci_set_event_mask_page_2_sync), /* HCI_OP_READ_LOCAL_CODECS */ HCI_INIT(hci_read_local_codecs_sync), /* HCI_OP_READ_LOCAL_PAIRING_OPTS */ HCI_INIT(hci_read_local_pairing_opts_sync), /* HCI_OP_GET_MWS_TRANSPORT_CONFIG */ HCI_INIT(hci_get_mws_transport_config_sync), /* HCI_OP_READ_SYNC_TRAIN_PARAMS */ HCI_INIT(hci_read_sync_train_params_sync), /* HCI_OP_WRITE_SC_SUPPORT */ HCI_INIT(hci_write_sc_support_1_sync), /* HCI_OP_WRITE_DEF_ERR_DATA_REPORTING */ HCI_INIT(hci_set_err_data_report_sync), {} }; /* Set Suggested Default Data Length to maximum if supported */ static int hci_le_set_write_def_data_len_sync(struct hci_dev *hdev) { struct hci_cp_le_write_def_data_len cp; if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)) return 0; memset(&cp, 0, sizeof(cp)); cp.tx_len = cpu_to_le16(hdev->le_max_tx_len); cp.tx_time = cpu_to_le16(hdev->le_max_tx_time); return __hci_cmd_sync_status(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } /* Set Default PHY parameters if command is supported, enables all supported * PHYs according to the LE Features bits. */ static int hci_le_set_default_phy_sync(struct hci_dev *hdev) { struct hci_cp_le_set_default_phy cp; if (!(hdev->commands[35] & 0x20)) { /* If the command is not supported it means only 1M PHY is * supported. */ hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; return 0; } memset(&cp, 0, sizeof(cp)); cp.all_phys = 0x00; cp.tx_phys = HCI_LE_SET_PHY_1M; cp.rx_phys = HCI_LE_SET_PHY_1M; /* Enables 2M PHY if supported */ if (le_2m_capable(hdev)) { cp.tx_phys |= HCI_LE_SET_PHY_2M; cp.rx_phys |= HCI_LE_SET_PHY_2M; } /* Enables Coded PHY if supported */ if (le_coded_capable(hdev)) { cp.tx_phys |= HCI_LE_SET_PHY_CODED; cp.rx_phys |= HCI_LE_SET_PHY_CODED; } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static const struct hci_init_stage le_init4[] = { /* HCI_OP_LE_WRITE_DEF_DATA_LEN */ HCI_INIT(hci_le_set_write_def_data_len_sync), /* HCI_OP_LE_SET_DEFAULT_PHY */ HCI_INIT(hci_le_set_default_phy_sync), {} }; static int hci_init4_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_init_stage_sync(hdev, hci_init4); if (err) return err; if (lmp_le_capable(hdev)) return hci_init_stage_sync(hdev, le_init4); return 0; } static int hci_init_sync(struct hci_dev *hdev) { int err; err = hci_init1_sync(hdev); if (err < 0) return err; if (hci_dev_test_flag(hdev, HCI_SETUP)) hci_debugfs_create_basic(hdev); err = hci_init2_sync(hdev); if (err < 0) return err; err = hci_init3_sync(hdev); if (err < 0) return err; err = hci_init4_sync(hdev); if (err < 0) return err; /* This function is only called when the controller is actually in * configured state. When the controller is marked as unconfigured, * this initialization procedure is not run. * * It means that it is possible that a controller runs through its * setup phase and then discovers missing settings. If that is the * case, then this function will not be called. It then will only * be called during the config phase. * * So only when in setup phase or config phase, create the debugfs * entries and register the SMP channels. */ if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) return 0; if (hci_dev_test_and_set_flag(hdev, HCI_DEBUGFS_CREATED)) return 0; hci_debugfs_create_common(hdev); if (lmp_bredr_capable(hdev)) hci_debugfs_create_bredr(hdev); if (lmp_le_capable(hdev)) hci_debugfs_create_le(hdev); return 0; } #define HCI_QUIRK_BROKEN(_quirk, _desc) { HCI_QUIRK_BROKEN_##_quirk, _desc } static const struct { unsigned long quirk; const char *desc; } hci_broken_table[] = { HCI_QUIRK_BROKEN(LOCAL_COMMANDS, "HCI Read Local Supported Commands not supported"), HCI_QUIRK_BROKEN(STORED_LINK_KEY, "HCI Delete Stored Link Key command is advertised, " "but not supported."), HCI_QUIRK_BROKEN(ERR_DATA_REPORTING, "HCI Read Default Erroneous Data Reporting command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER, "HCI Read Transmit Power Level command is advertised, " "but not supported."), HCI_QUIRK_BROKEN(FILTER_CLEAR_ALL, "HCI Set Event Filter command not supported."), HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN, "HCI Enhanced Setup Synchronous Connection command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT, "HCI LE Set Random Private Address Timeout command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(EXT_CREATE_CONN, "HCI LE Extended Create Connection command is " "advertised, but not supported."), HCI_QUIRK_BROKEN(WRITE_AUTH_PAYLOAD_TIMEOUT, "HCI WRITE AUTH PAYLOAD TIMEOUT command leads " "to unexpected SMP errors when pairing " "and will not be used."), HCI_QUIRK_BROKEN(LE_CODED, "HCI LE Coded PHY feature bit is set, " "but its usage is not supported.") }; /* This function handles hdev setup stage: * * Calls hdev->setup * Setup address if HCI_QUIRK_USE_BDADDR_PROPERTY is set. */ static int hci_dev_setup_sync(struct hci_dev *hdev) { int ret = 0; bool invalid_bdaddr; size_t i; if (!hci_dev_test_flag(hdev, HCI_SETUP) && !test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) return 0; bt_dev_dbg(hdev, ""); hci_sock_dev_event(hdev, HCI_DEV_SETUP); if (hdev->setup) ret = hdev->setup(hdev); for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) { if (test_bit(hci_broken_table[i].quirk, &hdev->quirks)) bt_dev_warn(hdev, "%s", hci_broken_table[i].desc); } /* The transport driver can set the quirk to mark the * BD_ADDR invalid before creating the HCI device or in * its setup callback. */ invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); if (!ret) { if (test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks) && !bacmp(&hdev->public_addr, BDADDR_ANY)) hci_dev_get_bd_addr_from_property(hdev); if (invalid_bdaddr && bacmp(&hdev->public_addr, BDADDR_ANY) && hdev->set_bdaddr) { ret = hdev->set_bdaddr(hdev, &hdev->public_addr); if (!ret) invalid_bdaddr = false; } } /* The transport driver can set these quirks before * creating the HCI device or in its setup callback. * * For the invalid BD_ADDR quirk it is possible that * it becomes a valid address if the bootloader does * provide it (see above). * * In case any of them is set, the controller has to * start up as unconfigured. */ if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || invalid_bdaddr) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* For an unconfigured controller it is required to * read at least the version information provided by * the Read Local Version Information command. * * If the set_bdaddr driver callback is provided, then * also the original Bluetooth public device address * will be read using the Read BD Address command. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return hci_unconf_init_sync(hdev); return ret; } /* This function handles hdev init stage: * * Calls hci_dev_setup_sync to perform setup stage * Calls hci_init_sync to perform HCI command init sequence */ static int hci_dev_init_sync(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); atomic_set(&hdev->cmd_cnt, 1); set_bit(HCI_INIT, &hdev->flags); ret = hci_dev_setup_sync(hdev); if (hci_dev_test_flag(hdev, HCI_CONFIG)) { /* If public address change is configured, ensure that * the address gets programmed. If the driver does not * support changing the public address, fail the power * on procedure. */ if (bacmp(&hdev->public_addr, BDADDR_ANY) && hdev->set_bdaddr) ret = hdev->set_bdaddr(hdev, &hdev->public_addr); else ret = -EADDRNOTAVAIL; } if (!ret) { if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = hci_init_sync(hdev); if (!ret && hdev->post_init) ret = hdev->post_init(hdev); } } /* If the HCI Reset command is clearing all diagnostic settings, * then they need to be reprogrammed after the init procedure * completed. */ if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { msft_do_open(hdev); aosp_do_open(hdev); } clear_bit(HCI_INIT, &hdev->flags); return ret; } int hci_dev_open_sync(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { ret = -ENODEV; goto done; } if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { /* Check for rfkill but allow the HCI setup stage to * proceed (which in itself doesn't cause any RF activity). */ if (hci_dev_test_flag(hdev, HCI_RFKILLED)) { ret = -ERFKILL; goto done; } /* Check for valid public address or a configured static * random address, but let the HCI setup proceed to * be able to determine if there is a public address * or not. * * In case of user channel usage, it is not important * if a public address or static random address is * available. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY)) { ret = -EADDRNOTAVAIL; goto done; } } if (test_bit(HCI_UP, &hdev->flags)) { ret = -EALREADY; goto done; } if (hdev->open(hdev)) { ret = -EIO; goto done; } hci_devcd_reset(hdev); set_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_OPEN); ret = hci_dev_init_sync(hdev); if (!ret) { hci_dev_hold(hdev); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); hci_adv_instances_set_rpa_expired(hdev, true); set_bit(HCI_UP, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_UP); hci_leds_update_powered(hdev, true); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG) && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) { ret = hci_powered_update_sync(hdev); mgmt_power_on(hdev, ret); } } else { /* Init failed, cleanup */ flush_work(&hdev->tx_work); /* Since hci_rx_work() is possible to awake new cmd_work * it should be flushed first to avoid unexpected call of * hci_cmd_work() */ flush_work(&hdev->rx_work); flush_work(&hdev->cmd_work); skb_queue_purge(&hdev->cmd_q); skb_queue_purge(&hdev->rx_q); if (hdev->flush) hdev->flush(hdev); if (hdev->sent_cmd) { cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } if (hdev->req_skb) { kfree_skb(hdev->req_skb); hdev->req_skb = NULL; } clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); hdev->close(hdev); hdev->flags &= BIT(HCI_RAW); } done: return ret; } /* This function requires the caller holds hdev->lock */ static void hci_pend_le_actions_clear(struct hci_dev *hdev) { struct hci_conn_params *p; list_for_each_entry(p, &hdev->le_conn_params, list) { hci_pend_le_list_del_init(p); if (p->conn) { hci_conn_drop(p->conn); hci_conn_put(p->conn); p->conn = NULL; } } BT_DBG("All LE pending actions cleared"); } static int hci_dev_shutdown(struct hci_dev *hdev) { int err = 0; /* Similar to how we first do setup and then set the exclusive access * bit for userspace, we must first unset userchannel and then clean up. * Otherwise, the kernel can't properly use the hci channel to clean up * the controller (some shutdown routines require sending additional * commands to the controller for example). */ bool was_userchannel = hci_dev_test_and_clear_flag(hdev, HCI_USER_CHANNEL); if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && test_bit(HCI_UP, &hdev->flags)) { /* Execute vendor specific shutdown routine */ if (hdev->shutdown) err = hdev->shutdown(hdev); } if (was_userchannel) hci_dev_set_flag(hdev, HCI_USER_CHANNEL); return err; } int hci_dev_close_sync(struct hci_dev *hdev) { bool auto_off; int err = 0; bt_dev_dbg(hdev, ""); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { disable_delayed_work(&hdev->power_off); disable_delayed_work(&hdev->ncmd_timer); disable_delayed_work(&hdev->le_scan_disable); } else { cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); cancel_delayed_work(&hdev->le_scan_disable); } hci_cmd_sync_cancel_sync(hdev, ENODEV); cancel_interleave_scan(hdev); if (hdev->adv_instance_timeout) { cancel_delayed_work_sync(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } err = hci_dev_shutdown(hdev); if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); return err; } hci_leds_update_powered(hdev, false); /* Flush RX and TX works */ flush_work(&hdev->tx_work); flush_work(&hdev->rx_work); if (hdev->discov_timeout > 0) { hdev->discov_timeout = 0; hci_dev_clear_flag(hdev, HCI_DISCOVERABLE); hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); } if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) cancel_delayed_work(&hdev->service_cache); if (hci_dev_test_flag(hdev, HCI_MGMT)) { struct adv_info *adv_instance; cancel_delayed_work_sync(&hdev->rpa_expired); list_for_each_entry(adv_instance, &hdev->adv_instances, list) cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); } /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_discovery_set_state(hdev, DISCOVERY_STOPPED); auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); if (!auto_off && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); hci_inquiry_cache_flush(hdev); hci_pend_le_actions_clear(hdev); hci_conn_hash_flush(hdev); /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */ smp_unregister(hdev); hci_dev_unlock(hdev); hci_sock_dev_event(hdev, HCI_DEV_DOWN); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { aosp_do_close(hdev); msft_do_close(hdev); } if (hdev->flush) hdev->flush(hdev); /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); hci_reset_sync(hdev); clear_bit(HCI_INIT, &hdev->flags); } /* flush cmd work */ flush_work(&hdev->cmd_work); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); skb_queue_purge(&hdev->raw_q); /* Drop last sent command */ if (hdev->sent_cmd) { cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } /* Drop last request */ if (hdev->req_skb) { kfree_skb(hdev->req_skb); hdev->req_skb = NULL; } clear_bit(HCI_RUNNING, &hdev->flags); hci_sock_dev_event(hdev, HCI_DEV_CLOSE); /* After this point our queues are empty and no tasks are scheduled. */ hdev->close(hdev); /* Clear flags */ hdev->flags &= BIT(HCI_RAW); hci_dev_clear_volatile_flags(hdev); memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); hci_codec_list_clear(&hdev->local_codecs); hci_dev_put(hdev); return err; } /* This function perform power on HCI command sequence as follows: * * If controller is already up (HCI_UP) performs hci_powered_update_sync * sequence otherwise run hci_dev_open_sync which will follow with * hci_powered_update_sync after the init sequence is completed. */ static int hci_power_on_sync(struct hci_dev *hdev) { int err; if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); return hci_powered_update_sync(hdev); } err = hci_dev_open_sync(hdev); if (err < 0) return err; /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to return the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (!bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_close_sync(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. */ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } return 0; } static int hci_remote_name_cancel_sync(struct hci_dev *hdev, bdaddr_t *addr) { struct hci_cp_remote_name_req_cancel cp; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, addr); return __hci_cmd_sync_status(hdev, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_stop_discovery_sync(struct hci_dev *hdev) { struct discovery_state *d = &hdev->discovery; struct inquiry_entry *e; int err; bt_dev_dbg(hdev, "state %u", hdev->discovery.state); if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) { if (test_bit(HCI_INQUIRY, &hdev->flags)) { err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); if (err) return err; } if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); err = hci_scan_disable_sync(hdev); if (err) return err; } } else { err = hci_scan_disable_sync(hdev); if (err) return err; } /* Resume advertising if it was paused */ if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* No further actions needed for LE-only discovery */ if (d->type == DISCOV_TYPE_LE) return 0; if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) { e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_PENDING); if (!e) return 0; /* Ignore cancel errors since it should interfere with stopping * of the discovery. */ hci_remote_name_cancel_sync(hdev, &e->data.bdaddr); } return 0; } static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_disconnect cp; if (test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) { /* This is a BIS connection, hci_conn_del will * do the necessary cleanup. */ hci_dev_lock(hdev); hci_conn_failed(conn, reason); hci_dev_unlock(hdev); return 0; } memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; /* Wait for HCI_EV_DISCONN_COMPLETE, not HCI_EV_CMD_STATUS, when the * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is * used when suspending or powering off, where we don't want to wait * for the peer's response. */ if (reason != HCI_ERROR_REMOTE_POWER_OFF) return __hci_cmd_sync_status_sk(hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp, HCI_EV_DISCONN_COMPLETE, HCI_CMD_TIMEOUT, NULL); return __hci_cmd_sync_status(hdev, HCI_OP_DISCONNECT, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { /* Return reason if scanning since the connection shall probably be * cleanup directly. */ if (test_bit(HCI_CONN_SCANNING, &conn->flags)) return reason; if (conn->role == HCI_ROLE_SLAVE || test_and_set_bit(HCI_CONN_CANCEL, &conn->flags)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); } static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { if (conn->type == LE_LINK) return hci_le_connect_cancel_sync(hdev, conn, reason); if (conn->type == ISO_LINK) { /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 1857: * * If this command is issued for a CIS on the Central and the * CIS is successfully terminated before being established, * then an HCI_LE_CIS_Established event shall also be sent for * this CIS with the Status Operation Cancelled by Host (0x44). */ if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) return hci_disconnect_sync(hdev, conn, reason); /* CIS with no Create CIS sent have nothing to cancel */ if (bacmp(&conn->dst, BDADDR_ANY)) return HCI_ERROR_LOCAL_HOST_TERM; /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. */ return 0; } if (hdev->hci_ver < BLUETOOTH_VER_1_2) return 0; /* Wait for HCI_EV_CONN_COMPLETE, not HCI_EV_CMD_STATUS, when the * reason is anything but HCI_ERROR_REMOTE_POWER_OFF. This reason is * used when suspending or powering off, where we don't want to wait * for the peer's response. */ if (reason != HCI_ERROR_REMOTE_POWER_OFF) return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_EV_CONN_COMPLETE, HCI_CMD_TIMEOUT, NULL); return __hci_cmd_sync_status(hdev, HCI_OP_CREATE_CONN_CANCEL, 6, &conn->dst, HCI_CMD_TIMEOUT); } static int hci_reject_sco_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_sync_conn_req cp; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.reason = reason; /* SCO rejection has its own limited set of * allowed error values (0x0D-0x0F). */ if (reason < 0x0d || reason > 0x0f) cp.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_le_reject_cis_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_le_reject_cis cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { struct hci_cp_reject_conn_req cp; if (conn->type == ISO_LINK) return hci_le_reject_cis_sync(hdev, conn, reason); if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.reason = reason; return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason) { int err = 0; u16 handle = conn->handle; bool disconnect = false; struct hci_conn *c; switch (conn->state) { case BT_CONNECTED: case BT_CONFIG: err = hci_disconnect_sync(hdev, conn, reason); break; case BT_CONNECT: err = hci_connect_cancel_sync(hdev, conn, reason); break; case BT_CONNECT2: err = hci_reject_conn_sync(hdev, conn, reason); break; case BT_OPEN: case BT_BOUND: break; default: disconnect = true; break; } hci_dev_lock(hdev); /* Check if the connection has been cleaned up concurrently */ c = hci_conn_hash_lookup_handle(hdev, handle); if (!c || c != conn) { err = 0; goto unlock; } /* Cleanup hci_conn object if it cannot be cancelled as it * likelly means the controller and host stack are out of sync * or in case of LE it was still scanning so it can be cleanup * safely. */ if (disconnect) { conn->state = BT_CLOSED; hci_disconn_cfm(conn, reason); hci_conn_del(conn); } else { hci_conn_failed(conn, reason); } unlock: hci_dev_unlock(hdev); return err; } static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason) { struct list_head *head = &hdev->conn_hash.list; struct hci_conn *conn; rcu_read_lock(); while ((conn = list_first_or_null_rcu(head, struct hci_conn, list))) { /* Make sure the connection is not freed while unlocking */ conn = hci_conn_get(conn); rcu_read_unlock(); /* Disregard possible errors since hci_conn_del shall have been * called even in case of errors had occurred since it would * then cause hci_conn_failed to be called which calls * hci_conn_del internally. */ hci_abort_conn_sync(hdev, conn, reason); hci_conn_put(conn); rcu_read_lock(); } rcu_read_unlock(); return 0; } /* This function perform power off HCI command sequence as follows: * * Clear Advertising * Stop Discovery * Disconnect all connections * hci_dev_close_sync */ static int hci_power_off_sync(struct hci_dev *hdev) { int err; /* If controller is already down there is nothing to do */ if (!test_bit(HCI_UP, &hdev->flags)) return 0; hci_dev_set_flag(hdev, HCI_POWERING_DOWN); if (test_bit(HCI_ISCAN, &hdev->flags) || test_bit(HCI_PSCAN, &hdev->flags)) { err = hci_write_scan_enable_sync(hdev, 0x00); if (err) goto out; } err = hci_clear_adv_sync(hdev, NULL, false); if (err) goto out; err = hci_stop_discovery_sync(hdev); if (err) goto out; /* Terminated due to Power Off */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) goto out; err = hci_dev_close_sync(hdev); out: hci_dev_clear_flag(hdev, HCI_POWERING_DOWN); return err; } int hci_set_powered_sync(struct hci_dev *hdev, u8 val) { if (val) return hci_power_on_sync(hdev); return hci_power_off_sync(hdev); } static int hci_write_iac_sync(struct hci_dev *hdev) { struct hci_cp_write_current_iac_lap cp; if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) return 0; memset(&cp, 0, sizeof(cp)); if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { /* Limited discoverable mode */ cp.num_iac = min_t(u8, hdev->num_iac, 2); cp.iac_lap[0] = 0x00; /* LIAC */ cp.iac_lap[1] = 0x8b; cp.iac_lap[2] = 0x9e; cp.iac_lap[3] = 0x33; /* GIAC */ cp.iac_lap[4] = 0x8b; cp.iac_lap[5] = 0x9e; } else { /* General discoverable mode */ cp.num_iac = 1; cp.iac_lap[0] = 0x33; /* GIAC */ cp.iac_lap[1] = 0x8b; cp.iac_lap[2] = 0x9e; } return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CURRENT_IAC_LAP, (cp.num_iac * 3) + 1, &cp, HCI_CMD_TIMEOUT); } int hci_update_discoverable_sync(struct hci_dev *hdev) { int err = 0; if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = hci_write_iac_sync(hdev); if (err) return err; err = hci_update_scan_sync(hdev); if (err) return err; err = hci_update_class_sync(hdev); if (err) return err; } /* Advertising instances don't use the global discoverable setting, so * only update AD if advertising was enabled using Set Advertising. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) { err = hci_update_adv_data_sync(hdev, 0x00); if (err) return err; /* Discoverable mode affects the local advertising * address in limited privacy mode. */ if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) { if (ext_adv_capable(hdev)) err = hci_start_ext_adv_sync(hdev, 0x00); else err = hci_enable_advertising_sync(hdev); } } return err; } static int update_discoverable_sync(struct hci_dev *hdev, void *data) { return hci_update_discoverable_sync(hdev); } int hci_update_discoverable(struct hci_dev *hdev) { /* Only queue if it would have any effect */ if (hdev_is_powered(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING) && hci_dev_test_flag(hdev, HCI_DISCOVERABLE) && hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) return hci_cmd_sync_queue(hdev, update_discoverable_sync, NULL, NULL); return 0; } int hci_update_connectable_sync(struct hci_dev *hdev) { int err; err = hci_update_scan_sync(hdev); if (err) return err; /* If BR/EDR is not enabled and we disable advertising as a * by-product of disabling connectable, we need to update the * advertising flags. */ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) err = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance); /* Update the advertising parameters if necessary */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || !list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) err = hci_start_ext_adv_sync(hdev, hdev->cur_adv_instance); else err = hci_enable_advertising_sync(hdev); if (err) return err; } return hci_update_passive_scan_sync(hdev); } int hci_inquiry_sync(struct hci_dev *hdev, u8 length, u8 num_rsp) { const u8 giac[3] = { 0x33, 0x8b, 0x9e }; const u8 liac[3] = { 0x00, 0x8b, 0x9e }; struct hci_cp_inquiry cp; bt_dev_dbg(hdev, ""); if (test_bit(HCI_INQUIRY, &hdev->flags)) return 0; hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_dev_unlock(hdev); memset(&cp, 0, sizeof(cp)); if (hdev->discovery.limited) memcpy(&cp.lap, liac, sizeof(cp.lap)); else memcpy(&cp.lap, giac, sizeof(cp.lap)); cp.length = length; cp.num_rsp = num_rsp; return __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) { u8 own_addr_type; /* Accept list is not used for discovery */ u8 filter_policy = 0x00; /* Default is to enable duplicates filter */ u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE; int err; bt_dev_dbg(hdev, ""); /* If controller is scanning, it means the passive scanning is * running. Thus, we should temporarily stop it in order to set the * discovery scanning parameters. */ err = hci_scan_disable_sync(hdev); if (err) { bt_dev_err(hdev, "Unable to disable scanning: %d", err); return err; } cancel_interleave_scan(hdev); /* Pause address resolution for active scan and stop advertising if * privacy is enabled. */ err = hci_pause_addr_resolution(hdev); if (err) goto failed; /* All active scans will be done with either a resolvable private * address (when privacy feature has been enabled) or non-resolvable * private address. */ err = hci_update_random_address_sync(hdev, true, scan_use_rpa(hdev), &own_addr_type); if (err < 0) own_addr_type = ADDR_LE_DEV_PUBLIC; if (hci_is_adv_monitoring(hdev) || (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && hdev->discovery.result_filtering)) { /* Duplicate filter should be disabled when some advertisement * monitor is activated, otherwise AdvMon can only receive one * advertisement for one peer(*) during active scanning, and * might report loss to these peers. * * If controller does strict duplicate filtering and the * discovery requires result filtering disables controller based * filtering since that can cause reports that would match the * host filter to not be reported. */ filter_dup = LE_SCAN_FILTER_DUP_DISABLE; } err = hci_start_scan_sync(hdev, LE_SCAN_ACTIVE, interval, hdev->le_scan_window_discovery, own_addr_type, filter_policy, filter_dup); if (!err) return err; failed: /* Resume advertising if it was paused */ if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* Resume passive scanning */ hci_update_passive_scan_sync(hdev); return err; } static int hci_start_interleaved_discovery_sync(struct hci_dev *hdev) { int err; bt_dev_dbg(hdev, ""); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery * 2); if (err) return err; return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0); } int hci_start_discovery_sync(struct hci_dev *hdev) { unsigned long timeout; int err; bt_dev_dbg(hdev, "type %u", hdev->discovery.type); switch (hdev->discovery.type) { case DISCOV_TYPE_BREDR: return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0); case DISCOV_TYPE_INTERLEAVED: /* When running simultaneous discovery, the LE scanning time * should occupy the whole discovery time sine BR/EDR inquiry * and LE scanning are scheduled by the controller. * * For interleaving discovery in comparison, BR/EDR inquiry * and LE scanning are done sequentially with separate * timeouts. */ if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); /* During simultaneous discovery, we double LE scan * interval. We must leave some time for the controller * to do BR/EDR inquiry. */ err = hci_start_interleaved_discovery_sync(hdev); break; } timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery); break; case DISCOV_TYPE_LE: timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery); break; default: return -EINVAL; } if (err) return err; bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout)); queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable, timeout); return 0; } static void hci_suspend_monitor_sync(struct hci_dev *hdev) { switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: msft_suspend_sync(hdev); break; default: return; } } /* This function disables discovery and mark it as paused */ static int hci_pause_discovery_sync(struct hci_dev *hdev) { int old_state = hdev->discovery.state; int err; /* If discovery already stopped/stopping/paused there nothing to do */ if (old_state == DISCOVERY_STOPPED || old_state == DISCOVERY_STOPPING || hdev->discovery_paused) return 0; hci_discovery_set_state(hdev, DISCOVERY_STOPPING); err = hci_stop_discovery_sync(hdev); if (err) return err; hdev->discovery_paused = true; hci_discovery_set_state(hdev, DISCOVERY_STOPPED); return 0; } static int hci_update_event_filter_sync(struct hci_dev *hdev) { struct bdaddr_list_with_flags *b; u8 scan = SCAN_DISABLED; bool scanning = test_bit(HCI_PSCAN, &hdev->flags); int err; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; /* Some fake CSR controllers lock up after setting this type of * filter, so avoid sending the request altogether. */ if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) return 0; /* Always clear event filter when starting */ hci_clear_event_filter_sync(hdev); list_for_each_entry(b, &hdev->accept_list, list) { if (!(b->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) continue; bt_dev_dbg(hdev, "Adding event filters for %pMR", &b->bdaddr); err = hci_set_event_filter_sync(hdev, HCI_FLT_CONN_SETUP, HCI_CONN_SETUP_ALLOW_BDADDR, &b->bdaddr, HCI_CONN_SETUP_AUTO_ON); if (err) bt_dev_dbg(hdev, "Failed to set event filter for %pMR", &b->bdaddr); else scan = SCAN_PAGE; } if (scan && !scanning) hci_write_scan_enable_sync(hdev, scan); else if (!scan && scanning) hci_write_scan_enable_sync(hdev, scan); return 0; } /* This function disables scan (BR and LE) and mark it as paused */ static int hci_pause_scan_sync(struct hci_dev *hdev) { if (hdev->scanning_paused) return 0; /* Disable page scan if enabled */ if (test_bit(HCI_PSCAN, &hdev->flags)) hci_write_scan_enable_sync(hdev, SCAN_DISABLED); hci_scan_disable_sync(hdev); hdev->scanning_paused = true; return 0; } /* This function performs the HCI suspend procedures in the follow order: * * Pause discovery (active scanning/inquiry) * Pause Directed Advertising/Advertising * Pause Scanning (passive scanning in case discovery was not active) * Disconnect all connections * Set suspend_status to BT_SUSPEND_DISCONNECT if hdev cannot wakeup * otherwise: * Update event mask (only set events that are allowed to wake up the host) * Update event filter (with devices marked with HCI_CONN_FLAG_REMOTE_WAKEUP) * Update passive scanning (lower duty cycle) * Set suspend_status to BT_SUSPEND_CONFIGURE_WAKE */ int hci_suspend_sync(struct hci_dev *hdev) { int err; /* If marked as suspended there nothing to do */ if (hdev->suspended) return 0; /* Mark device as suspended */ hdev->suspended = true; /* Pause discovery if not already stopped */ hci_pause_discovery_sync(hdev); /* Pause other advertisements */ hci_pause_advertising_sync(hdev); /* Suspend monitor filters */ hci_suspend_monitor_sync(hdev); /* Prevent disconnects from causing scanning to be re-enabled */ hci_pause_scan_sync(hdev); if (hci_conn_count(hdev)) { /* Soft disconnect everything (power off) */ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF); if (err) { /* Set state to BT_RUNNING so resume doesn't notify */ hdev->suspend_state = BT_RUNNING; hci_resume_sync(hdev); return err; } /* Update event mask so only the allowed event can wakeup the * host. */ hci_set_event_mask_sync(hdev); } /* Only configure accept list if disconnect succeeded and wake * isn't being prevented. */ if (!hdev->wakeup || !hdev->wakeup(hdev)) { hdev->suspend_state = BT_SUSPEND_DISCONNECT; return 0; } /* Unpause to take care of updating scanning params */ hdev->scanning_paused = false; /* Enable event filter for paired devices */ hci_update_event_filter_sync(hdev); /* Update LE passive scan if enabled */ hci_update_passive_scan_sync(hdev); /* Pause scan changes again. */ hdev->scanning_paused = true; hdev->suspend_state = BT_SUSPEND_CONFIGURE_WAKE; return 0; } /* This function resumes discovery */ static int hci_resume_discovery_sync(struct hci_dev *hdev) { int err; /* If discovery not paused there nothing to do */ if (!hdev->discovery_paused) return 0; hdev->discovery_paused = false; hci_discovery_set_state(hdev, DISCOVERY_STARTING); err = hci_start_discovery_sync(hdev); hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED : DISCOVERY_FINDING); return err; } static void hci_resume_monitor_sync(struct hci_dev *hdev) { switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: msft_resume_sync(hdev); break; default: return; } } /* This function resume scan and reset paused flag */ static int hci_resume_scan_sync(struct hci_dev *hdev) { if (!hdev->scanning_paused) return 0; hdev->scanning_paused = false; hci_update_scan_sync(hdev); /* Reset passive scanning to normal */ hci_update_passive_scan_sync(hdev); return 0; } /* This function performs the HCI suspend procedures in the follow order: * * Restore event mask * Clear event filter * Update passive scanning (normal duty cycle) * Resume Directed Advertising/Advertising * Resume discovery (active scanning/inquiry) */ int hci_resume_sync(struct hci_dev *hdev) { /* If not marked as suspended there nothing to do */ if (!hdev->suspended) return 0; hdev->suspended = false; /* Restore event mask */ hci_set_event_mask_sync(hdev); /* Clear any event filters and restore scan state */ hci_clear_event_filter_sync(hdev); /* Resume scanning */ hci_resume_scan_sync(hdev); /* Resume monitor filters */ hci_resume_monitor_sync(hdev); /* Resume other advertisements */ hci_resume_advertising_sync(hdev); /* Resume discovery */ hci_resume_discovery_sync(hdev); return 0; } static bool conn_use_rpa(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; return hci_dev_test_flag(hdev, HCI_PRIVACY); } static int hci_le_ext_directed_advertising_sync(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_cp_le_set_ext_adv_params cp; int err; bdaddr_t random_addr; u8 own_addr_type; err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (err) return err; /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ err = hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL, &own_addr_type, &random_addr); if (err) return err; memset(&cp, 0, sizeof(cp)); cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND); cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = HCI_TX_POWER_INVALID; cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_1M; cp.handle = 0x00; /* Use instance 0 for directed adv */ cp.own_addr_type = own_addr_type; cp.peer_addr_type = conn->dst_type; bacpy(&cp.peer_addr, &conn->dst); /* As per Core Spec 5.2 Vol 2, PART E, Sec 7.8.53, for * advertising_event_property LE_LEGACY_ADV_DIRECT_IND * does not supports advertising data when the advertising set already * contains some, the controller shall return erroc code 'Invalid * HCI Command Parameters(0x12). * So it is required to remove adv set for handle 0x00. since we use * instance 0 for directed adv. */ err = hci_remove_ext_adv_instance_sync(hdev, cp.handle, NULL); if (err) return err; err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (err) return err; /* Check if random address need to be updated */ if (own_addr_type == ADDR_LE_DEV_RANDOM && bacmp(&random_addr, BDADDR_ANY) && bacmp(&random_addr, &hdev->random_addr)) { err = hci_set_adv_set_random_addr_sync(hdev, 0x00, &random_addr); if (err) return err; } return hci_enable_ext_advertising_sync(hdev, 0x00); } static int hci_le_directed_advertising_sync(struct hci_dev *hdev, struct hci_conn *conn) { struct hci_cp_le_set_adv_param cp; u8 status; u8 own_addr_type; u8 enable; if (ext_adv_capable(hdev)) return hci_le_ext_directed_advertising_sync(hdev, conn); /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. */ hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ status = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (status) return status; memset(&cp, 0, sizeof(cp)); /* Some controllers might reject command if intervals are not * within range for undirected advertising. * BCM20702A0 is known to be affected by this. */ cp.min_interval = cpu_to_le16(0x0020); cp.max_interval = cpu_to_le16(0x0020); cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; bacpy(&cp.direct_addr, &conn->dst); cp.channel_map = hdev->le_adv_channel_map; status = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (status) return status; enable = 0x01; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); } static void set_ext_conn_params(struct hci_conn *conn, struct hci_cp_le_ext_conn_param *p) { struct hci_dev *hdev = conn->hdev; memset(p, 0, sizeof(*p)); p->scan_interval = cpu_to_le16(hdev->le_scan_int_connect); p->scan_window = cpu_to_le16(hdev->le_scan_window_connect); p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); p->conn_latency = cpu_to_le16(conn->le_conn_latency); p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); p->min_ce_len = cpu_to_le16(0x0000); p->max_ce_len = cpu_to_le16(0x0000); } static int hci_le_ext_create_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 own_addr_type) { struct hci_cp_le_ext_create_conn *cp; struct hci_cp_le_ext_conn_param *p; u8 data[sizeof(*cp) + sizeof(*p) * 3]; u32 plen; cp = (void *)data; p = (void *)cp->data; memset(cp, 0, sizeof(*cp)); bacpy(&cp->peer_addr, &conn->dst); cp->peer_addr_type = conn->dst_type; cp->own_addr_type = own_addr_type; plen = sizeof(*cp); if (scan_1m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_1M || conn->le_adv_sec_phy == HCI_ADV_PHY_1M)) { cp->phys |= LE_SCAN_PHY_1M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_2m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_2M || conn->le_adv_sec_phy == HCI_ADV_PHY_2M)) { cp->phys |= LE_SCAN_PHY_2M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_coded(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_CODED || conn->le_adv_sec_phy == HCI_ADV_PHY_CODED)) { cp->phys |= LE_SCAN_PHY_CODED; set_ext_conn_params(conn, p); plen += sizeof(*p); } return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN, plen, data, HCI_EV_LE_ENHANCED_CONN_COMPLETE, conn->conn_timeout, NULL); } static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_create_conn cp; struct hci_conn_params *params; u8 own_addr_type; int err; struct hci_conn *conn = data; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; bt_dev_dbg(hdev, "conn %p", conn); clear_bit(HCI_CONN_SCANNING, &conn->flags); conn->state = BT_CONNECT; /* If requested to connect as peripheral use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { /* If we're active scanning and simultaneous roles is not * enabled simply reject the attempt. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE && !hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)) { hci_conn_del(conn); return -EBUSY; } /* Pause advertising while doing directed advertising. */ hci_pause_advertising_sync(hdev); err = hci_le_directed_advertising_sync(hdev, conn); goto done; } /* Disable advertising if simultaneous roles is not in use. */ if (!hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES)) hci_pause_advertising_sync(hdev); params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { conn->le_conn_min_interval = params->conn_min_interval; conn->le_conn_max_interval = params->conn_max_interval; conn->le_conn_latency = params->conn_latency; conn->le_supv_timeout = params->supervision_timeout; } else { conn->le_conn_min_interval = hdev->le_conn_min_interval; conn->le_conn_max_interval = hdev->le_conn_max_interval; conn->le_conn_latency = hdev->le_conn_latency; conn->le_supv_timeout = hdev->le_supv_timeout; } /* If controller is scanning, we stop it since some controllers are * not able to scan and connect at the same time. Also set the * HCI_LE_SCAN_INTERRUPTED flag so that the command complete * handler for scan disabling knows to set the correct discovery * state. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_scan_disable_sync(hdev); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } /* Update random address, but set require_privacy to false so * that we never connect with an non-resolvable address. */ err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn), &own_addr_type); if (err) goto done; /* Send command LE Extended Create Connection if supported */ if (use_ext_conn(hdev)) { err = hci_le_ext_create_conn_sync(hdev, conn, own_addr_type); goto done; } memset(&cp, 0, sizeof(cp)); cp.scan_interval = cpu_to_le16(hdev->le_scan_int_connect); cp.scan_window = cpu_to_le16(hdev->le_scan_window_connect); bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; cp.own_address_type = own_addr_type; cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); cp.conn_latency = cpu_to_le16(conn->le_conn_latency); cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261: * * If this event is unmasked and the HCI_LE_Connection_Complete event * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is * sent when a new connection has been created. */ err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp, use_enhanced_conn_complete(hdev) ? HCI_EV_LE_ENHANCED_CONN_COMPLETE : HCI_EV_LE_CONN_COMPLETE, conn->conn_timeout, NULL); done: if (err == -ETIMEDOUT) hci_le_connect_cancel_sync(hdev, conn, 0x00); /* Re-enable advertising after the connection attempt is finished. */ hci_resume_advertising_sync(hdev); return err; } int hci_le_create_cis_sync(struct hci_dev *hdev) { DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f); size_t aux_num_cis = 0; struct hci_conn *conn; u8 cig = BT_ISO_QOS_CIG_UNSET; /* The spec allows only one pending LE Create CIS command at a time. If * the command is pending now, don't do anything. We check for pending * connections after each CIS Established event. * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2566: * * If the Host issues this command before all the * HCI_LE_CIS_Established events from the previous use of the * command have been generated, the Controller shall return the * error code Command Disallowed (0x0C). * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2567: * * When the Controller receives the HCI_LE_Create_CIS command, the * Controller sends the HCI_Command_Status event to the Host. An * HCI_LE_CIS_Established event will be generated for each CIS when it * is established or if it is disconnected or considered lost before * being established; until all the events are generated, the command * remains pending. */ hci_dev_lock(hdev); rcu_read_lock(); /* Wait until previous Create CIS has completed */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) goto done; } /* Find CIG with all CIS ready */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { struct hci_conn *link; if (hci_conn_check_create_cis(conn)) continue; cig = conn->iso_qos.ucast.cig; list_for_each_entry_rcu(link, &hdev->conn_hash.list, list) { if (hci_conn_check_create_cis(link) > 0 && link->iso_qos.ucast.cig == cig && link->state != BT_CONNECTED) { cig = BT_ISO_QOS_CIG_UNSET; break; } } if (cig != BT_ISO_QOS_CIG_UNSET) break; } if (cig == BT_ISO_QOS_CIG_UNSET) goto done; list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { struct hci_cis *cis = &cmd->cis[aux_num_cis]; if (hci_conn_check_create_cis(conn) || conn->iso_qos.ucast.cig != cig) continue; set_bit(HCI_CONN_CREATE_CIS, &conn->flags); cis->acl_handle = cpu_to_le16(conn->parent->handle); cis->cis_handle = cpu_to_le16(conn->handle); aux_num_cis++; if (aux_num_cis >= cmd->num_cis) break; } cmd->num_cis = aux_num_cis; done: rcu_read_unlock(); hci_dev_unlock(hdev); if (!aux_num_cis) return 0; /* Wait for HCI_LE_CIS_Established */ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS, struct_size(cmd, cis, cmd->num_cis), cmd, HCI_EVT_LE_CIS_ESTABLISHED, conn->conn_timeout, NULL); } int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle) { struct hci_cp_le_remove_cig cp; memset(&cp, 0, sizeof(cp)); cp.cig_id = handle; return __hci_cmd_sync_status(hdev, HCI_OP_LE_REMOVE_CIG, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle) { struct hci_cp_le_big_term_sync cp; memset(&cp, 0, sizeof(cp)); cp.handle = handle; return __hci_cmd_sync_status(hdev, HCI_OP_LE_BIG_TERM_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle) { struct hci_cp_le_pa_term_sync cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(handle); return __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bool use_rpa, struct adv_info *adv_instance, u8 *own_addr_type, bdaddr_t *rand_addr) { int err; bacpy(rand_addr, BDADDR_ANY); /* If privacy is enabled use a resolvable private address. If * current RPA has expired then generate a new one. */ if (use_rpa) { /* If Controller supports LL Privacy use own address type is * 0x03 */ if (ll_privacy_capable(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; if (adv_instance) { if (adv_rpa_valid(adv_instance)) return 0; } else { if (rpa_valid(hdev)) return 0; } err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); if (err < 0) { bt_dev_err(hdev, "failed to generate new RPA"); return err; } bacpy(rand_addr, &hdev->rpa); return 0; } /* In case of required privacy without resolvable private address, * use an non-resolvable private address. This is useful for * non-connectable advertising. */ if (require_privacy) { bdaddr_t nrpa; while (true) { /* The non-resolvable private address is generated * from random six bytes with the two most significant * bits cleared. */ get_random_bytes(&nrpa, 6); nrpa.b[5] &= 0x3f; /* The non-resolvable private address shall not be * equal to the public address. */ if (bacmp(&hdev->bdaddr, &nrpa)) break; } *own_addr_type = ADDR_LE_DEV_RANDOM; bacpy(rand_addr, &nrpa); return 0; } /* No privacy so use a public address. */ *own_addr_type = ADDR_LE_DEV_PUBLIC; return 0; } static int _update_adv_data_sync(struct hci_dev *hdev, void *data) { u8 instance = PTR_UINT(data); return hci_update_adv_data_sync(hdev, instance); } int hci_update_adv_data(struct hci_dev *hdev, u8 instance) { return hci_cmd_sync_queue(hdev, _update_adv_data_sync, UINT_PTR(instance), NULL); } static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; struct inquiry_entry *ie; struct hci_cp_create_conn cp; int err; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; /* Many controllers disallow HCI Create Connection while it is doing * HCI Inquiry. So we cancel the Inquiry first before issuing HCI Create * Connection. This may cause the MGMT discovering state to become false * without user space's request but it is okay since the MGMT Discovery * APIs do not promise that discovery should be done forever. Instead, * the user space monitors the status of MGMT discovering and it may * request for discovery again when this flag becomes false. */ if (test_bit(HCI_INQUIRY, &hdev->flags)) { err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); if (err) bt_dev_warn(hdev, "Failed to cancel inquiry %d", err); } conn->state = BT_CONNECT; conn->out = true; conn->role = HCI_ROLE_MASTER; conn->attempt++; conn->link_policy = hdev->link_policy; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; ie = hci_inquiry_cache_lookup(hdev, &conn->dst); if (ie) { if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { cp.pscan_rep_mode = ie->data.pscan_rep_mode; cp.pscan_mode = ie->data.pscan_mode; cp.clock_offset = ie->data.clock_offset | cpu_to_le16(0x8000); } memcpy(conn->dev_class, ie->data.dev_class, 3); } cp.pkt_type = cpu_to_le16(conn->pkt_type); if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) cp.role_switch = 0x01; else cp.role_switch = 0x00; return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp, HCI_EV_CONN_COMPLETE, conn->conn_timeout, NULL); } int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn, NULL); } static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; bt_dev_dbg(hdev, "err %d", err); if (err == -ECANCELED) return; hci_dev_lock(hdev); if (!hci_conn_valid(hdev, conn)) goto done; if (!err) { hci_connect_le_scan_cleanup(conn, 0x00); goto done; } /* Check if connection is still pending */ if (conn != hci_lookup_le_connect(hdev)) goto done; /* Flush to make sure we send create conn cancel command if needed */ flush_delayed_work(&conn->le_conn_timeout); hci_conn_failed(conn, bt_status(err)); done: hci_dev_unlock(hdev); } int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn) { return hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn, create_le_conn_complete); } int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn) { if (conn->state != BT_OPEN) return -EINVAL; switch (conn->type) { case ACL_LINK: return !hci_cmd_sync_dequeue_once(hdev, hci_acl_create_conn_sync, conn, NULL); case LE_LINK: return !hci_cmd_sync_dequeue_once(hdev, hci_le_create_conn_sync, conn, create_le_conn_complete); } return -ENOENT; } int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, struct hci_conn_params *params) { struct hci_cp_le_conn_update cp; memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.conn_interval_min = cpu_to_le16(params->conn_min_interval); cp.conn_interval_max = cpu_to_le16(params->conn_max_interval); cp.conn_latency = cpu_to_le16(params->conn_latency); cp.supervision_timeout = cpu_to_le16(params->supervision_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); return __hci_cmd_sync_status(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); }
70 70 72 73 7 7 258 259 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 // SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju * Jim Keniston * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> #include <linux/highmem.h> #include <linux/pagemap.h> /* read_mapping_page */ #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> #include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include <linux/percpu-rwsem.h> #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> #include <linux/rcupdate_trace.h> #include <linux/workqueue.h> #include <linux/srcu.h> #include <linux/oom.h> /* check_stable_address_space */ #include <linux/uprobes.h> #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE static struct rb_root uprobes_tree = RB_ROOT; /* * allows us to skip the uprobe_mmap if there are no uprobe events active * at this time. Probably a fine grained per inode count is better? */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Covers return_instance's uprobe lifetime. */ DEFINE_STATIC_SRCU(uretprobes_srcu); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ union { struct rcu_head rcu; struct work_struct work; }; loff_t offset; loff_t ref_ctr_offset; unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type * owned by the arch-specific code: * * insn - copy_insn() saves the original instruction here for * arch_uprobe_analyze_insn(). * * ixol - potentially modified instruction to execute out of * line, copied to xol_area by xol_get_insn_slot(). */ struct arch_uprobe arch; }; struct delayed_uprobe { struct list_head list; struct uprobe *uprobe; struct mm_struct *mm; }; static DEFINE_MUTEX(delayed_uprobe_lock); static LIST_HEAD(delayed_uprobe_list); /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction * mangled by set_swbp(). * * On a breakpoint hit, thread contests for a slot. It frees the * slot after singlestep. Currently a fixed number of slots are * allocated. */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ unsigned long *bitmap; /* 0 = free slot */ struct page *page; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ unsigned long vaddr; /* Page(s) of instruction slots */ }; static void uprobe_warn(struct task_struct *t, const char *msg) { pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg); } /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have * changed after breakpoint was inserted. * - is_register: indicates if we are in register context. * - Return 1 if the specified virtual address is in an * executable vma. */ static bool valid_vma(struct vm_area_struct *vma, bool is_register) { vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE; if (is_register) flags |= VM_WRITE; return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC; } static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); } static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) { return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); } /** * __replace_page - replace page in vma by new page. * based on replace_page in mm/ksm.c * * @vma: vma that holds the pte pointing to page * @addr: address the old @page is mapped at * @old_page: the page we are replacing by new_page * @new_page: the modified page we replace page by * * If @new_page is NULL, only unmap @old_page. * * Returns 0 on success, negative error code otherwise. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct folio *old_folio = page_folio(old_page); struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; pte_t pte; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, addr + PAGE_SIZE); if (new_page) { new_folio = page_folio(new_page); err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } /* For folio_free_swap() below */ folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; if (!page_vma_mapped_walk(&pvmw)) goto unlock; VM_BUG_ON_PAGE(addr != pvmw.address, old_page); pte = ptep_get(pvmw.pte); /* * Handle PFN swap PTES, such as device-exclusive ones, that actually * map pages: simply trigger GUP again to fix it up. */ if (unlikely(!pte_present(pte))) { page_vma_mapped_walk_done(&pvmw); goto unlock; } if (new_page) { folio_get(new_folio); folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE); folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_folio)); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(pte)); ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); folio_remove_rmap_pte(old_folio, old_page, vma); if (!folio_mapped(old_folio)) folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); folio_unlock(old_folio); return err; } /** * is_swbp_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_swbp_insn * Returns true if @insn is a breakpoint instruction. */ bool __weak is_swbp_insn(uprobe_opcode_t *insn) { return *insn == UPROBE_SWBP_INSN; } /** * is_trap_insn - check if instruction is breakpoint instruction. * @insn: instruction to be checked. * Default implementation of is_trap_insn * Returns true if @insn is a breakpoint instruction. * * This function is needed for the case where an architecture has multiple * trap instructions (like powerpc). */ bool __weak is_trap_insn(uprobe_opcode_t *insn) { return is_swbp_insn(insn); } static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); kunmap_atomic(kaddr); } static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) { void *kaddr = kmap_atomic(page); memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); kunmap_atomic(kaddr); } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) { uprobe_opcode_t old_opcode; bool is_swbp; /* * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. * We do not check if it is any other 'trap variant' which could * be conditional trap instruction such as the one powerpc supports. * * The logic is that we do not care if the underlying instruction * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { if (is_swbp) /* register: already installed? */ return 0; } else { if (!is_swbp) /* unregister: was it changed by us? */ return 0; } return 1; } static struct delayed_uprobe * delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; list_for_each_entry(du, &delayed_uprobe_list, list) if (du->uprobe == uprobe && du->mm == mm) return du; return NULL; } static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm) { struct delayed_uprobe *du; if (delayed_uprobe_check(uprobe, mm)) return 0; du = kzalloc(sizeof(*du), GFP_KERNEL); if (!du) return -ENOMEM; du->uprobe = uprobe; du->mm = mm; list_add(&du->list, &delayed_uprobe_list); return 0; } static void delayed_uprobe_delete(struct delayed_uprobe *du) { if (WARN_ON(!du)) return; list_del(&du->list); kfree(du); } static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm) { struct list_head *pos, *q; struct delayed_uprobe *du; if (!uprobe && !mm) return; list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (uprobe && du->uprobe != uprobe) continue; if (mm && du->mm != mm) continue; delayed_uprobe_delete(du); } } static bool valid_ref_ctr_vma(struct uprobe *uprobe, struct vm_area_struct *vma) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset); return uprobe->ref_ctr_offset && vma->vm_file && file_inode(vma->vm_file) == uprobe->inode && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && vma->vm_start <= vaddr && vma->vm_end > vaddr; } static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; return NULL; } static int __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d) { void *kaddr; struct page *page; int ret; short *ptr; if (!vaddr || !d) return -EINVAL; ret = get_user_pages_remote(mm, vaddr, 1, FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, * it may return 0, in that case we have to return error. */ return ret == 0 ? -EBUSY : ret; } kaddr = kmap_atomic(page); ptr = kaddr + (vaddr & ~PAGE_MASK); if (unlikely(*ptr + d < 0)) { pr_warn("ref_ctr going negative. vaddr: 0x%lx, " "curr val: %d, delta: %d\n", vaddr, *ptr, d); ret = -EINVAL; goto out; } *ptr += d; ret = 0; out: kunmap_atomic(kaddr); put_page(page); return ret; } static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) uprobe->ref_ctr_offset, mm); } static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, short d) { struct vm_area_struct *rc_vma; unsigned long rc_vaddr; int ret = 0; rc_vma = find_ref_ctr_vma(uprobe, mm); if (rc_vma) { rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset); ret = __update_ref_ctr(mm, rc_vaddr, d); if (ret) update_ref_ctr_warn(uprobe, mm, d); if (d > 0) return ret; } mutex_lock(&delayed_uprobe_lock); if (d > 0) ret = delayed_uprobe_add(uprobe, mm); else delayed_uprobe_remove(uprobe, mm); mutex_unlock(&delayed_uprobe_lock); return ret; } /* * NOTE: * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction * supported by that architecture then we need to modify is_trap_at_addr and * uprobe_write_opcode accordingly. This would never be a problem for archs * that have fixed length instructions. * * uprobe_write_opcode - write the opcode at a given virtual address. * @auprobe: arch specific probepoint information. * @mm: the probed process address space. * @vaddr: the virtual address to store the opcode. * @opcode: opcode to be written at @vaddr. * * Called with mm->mmap_lock held for read or write. * Return 0 (success) or a negative errno. */ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct uprobe *uprobe; struct page *old_page, *new_page; struct vm_area_struct *vma; int ret, is_register, ref_ctr_updated = 0; bool orig_page_huge = false; unsigned int gup_flags = FOLL_FORCE; is_register = is_swbp_insn(&opcode); uprobe = container_of(auprobe, struct uprobe, arch); retry: if (is_register) gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); if (IS_ERR(old_page)) return PTR_ERR(old_page); ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) goto put_old; if (is_zero_page(old_page)) { ret = -EINVAL; goto put_old; } if (WARN(!is_register && PageCompound(old_page), "uprobe unregister should never work on compound page\n")) { ret = -EINVAL; goto put_old; } /* We are going to replace instruction, update ref_ctr. */ if (!ref_ctr_updated && uprobe->ref_ctr_offset) { ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1); if (ret) goto put_old; ref_ctr_updated = 1; } ret = 0; if (!is_register && !PageAnon(old_page)) goto put_old; ret = anon_vma_prepare(vma); if (ret) goto put_old; ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) goto put_old; __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); if (!is_register) { struct page *orig_page; pgoff_t index; VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, index); if (orig_page) { if (PageUptodate(orig_page) && pages_identical(new_page, orig_page)) { /* let go new_page */ put_page(new_page); new_page = NULL; if (PageCompound(orig_page)) orig_page_huge = true; } put_page(orig_page); } } ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page); if (new_page) put_page(new_page); put_old: put_page(old_page); if (unlikely(ret == -EAGAIN)) goto retry; /* Revert back reference counter if instruction update failed. */ if (ret && is_register && ref_ctr_updated) update_ref_ctr(uprobe, mm, -1); /* try collapse pmd for compound page */ if (!ret && orig_page_huge) collapse_pte_mapped_thp(mm, vaddr, false); return ret; } /** * set_swbp - store breakpoint at a given address. * @auprobe: arch specific probepoint information. * @mm: the probed process address space. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, store the breakpoint instruction at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); } /** * set_orig_insn - Restore the original instruction. * @mm: the probed process address space. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. */ int __weak set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { return uprobe_write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } /* uprobe should have guaranteed positive refcount */ static struct uprobe *get_uprobe(struct uprobe *uprobe) { refcount_inc(&uprobe->ref); return uprobe; } /* * uprobe should have guaranteed lifetime, which can be either of: * - caller already has refcount taken (and wants an extra one); * - uprobe is RCU protected and won't be freed until after grace period; * - we are holding uprobes_treelock (for read or write, doesn't matter). */ static struct uprobe *try_get_uprobe(struct uprobe *uprobe) { if (refcount_inc_not_zero(&uprobe->ref)) return uprobe; return NULL; } static inline bool uprobe_is_active(struct uprobe *uprobe) { return !RB_EMPTY_NODE(&uprobe->rb_node); } static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } static void uprobe_free_srcu(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); } static void uprobe_free_deferred(struct work_struct *work) { struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); if (uprobe_is_active(uprobe)) { write_seqcount_begin(&uprobes_seqcount); rb_erase(&uprobe->rb_node, &uprobes_tree); write_seqcount_end(&uprobes_seqcount); } write_unlock(&uprobes_treelock); /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here. */ mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); /* start srcu -> rcu_tasks_trace -> kfree chain */ call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); } static void put_uprobe(struct uprobe *uprobe) { if (!refcount_dec_and_test(&uprobe->ref)) return; INIT_WORK(&uprobe->work, uprobe_free_deferred); schedule_work(&uprobe->work); } /* Initialize hprobe as SRCU-protected "leased" uprobe */ static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) { WARN_ON(!uprobe); hprobe->state = HPROBE_LEASED; hprobe->uprobe = uprobe; hprobe->srcu_idx = srcu_idx; } /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) { hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; hprobe->uprobe = uprobe; hprobe->srcu_idx = -1; } /* * hprobe_consume() fetches hprobe's underlying uprobe and detects whether * uprobe is SRCU protected or is refcounted. hprobe_consume() can be * used only once for a given hprobe. * * Caller has to call hprobe_finalize() and pass previous hprobe_state, so * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever * is appropriate. */ static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) { *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); switch (*hstate) { case HPROBE_LEASED: case HPROBE_STABLE: return hprobe->uprobe; case HPROBE_GONE: /* uprobe is NULL, no SRCU */ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ return NULL; default: WARN(1, "hprobe invalid state %d", *hstate); return NULL; } } /* * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. * hprobe_finalize() can only be used from current context after * hprobe_consume() call (which determines uprobe and hstate value). */ static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) { switch (hstate) { case HPROBE_LEASED: __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); break; case HPROBE_STABLE: put_uprobe(hprobe->uprobe); break; case HPROBE_GONE: case HPROBE_CONSUMED: break; default: WARN(1, "hprobe invalid state %d", hstate); break; } } /* * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of * them can win the race to perform SRCU unlocking. Whoever wins must perform * SRCU unlock. * * Returns underlying valid uprobe or NULL, if there was no underlying uprobe * to begin with or we failed to bump its refcount and it's going away. * * Returned non-NULL uprobe can be still safely used within an ongoing SRCU * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has * an extra refcount for caller to assume and use. Otherwise, it's not * guaranteed that returned uprobe has a positive refcount, so caller has to * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current * SRCU lock region. See dup_utask(). */ static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) { enum hprobe_state hstate; /* * Caller should guarantee that return_instance is not going to be * freed from under us. This can be achieved either through holding * rcu_read_lock() or by owning return_instance in the first place. * * Underlying uprobe is itself protected from reuse by SRCU, so ensure * SRCU lock is held properly. */ lockdep_assert(srcu_read_lock_held(&uretprobes_srcu)); hstate = READ_ONCE(hprobe->state); switch (hstate) { case HPROBE_STABLE: /* uprobe has positive refcount, bump refcount, if necessary */ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; case HPROBE_GONE: /* * SRCU was unlocked earlier and we didn't manage to take * uprobe refcnt, so it's effectively NULL */ return NULL; case HPROBE_CONSUMED: /* * uprobe was consumed, so it's effectively NULL as far as * uretprobe processing logic is concerned */ return NULL; case HPROBE_LEASED: { struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); /* * Try to switch hprobe state, guarding against * hprobe_consume() or another hprobe_expire() racing with us. * Note, if we failed to get uprobe refcount, we use special * HPROBE_GONE state to signal that hprobe->uprobe shouldn't * be used as it will be freed after SRCU is unlocked. */ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { /* We won the race, we are the ones to unlock SRCU */ __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); return get ? get_uprobe(uprobe) : uprobe; } /* * We lost the race, undo refcount bump (if it ever happened), * unless caller would like an extra refcount anyways. */ if (uprobe && !get) put_uprobe(uprobe); /* * Even if hprobe_consume() or another hprobe_expire() wins * the state update race and unlocks SRCU from under us, we * still have a guarantee that underyling uprobe won't be * freed due to ongoing caller's SRCU lock region, so we can * return it regardless. Also, if `get` was true, we also have * an extra ref for the caller to own. This is used in dup_utask(). */ return uprobe; } default: WARN(1, "unknown hprobe state %d", hstate); return NULL; } } static __always_inline int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, const struct uprobe *r) { if (l_inode < r->inode) return -1; if (l_inode > r->inode) return 1; if (l_offset < r->offset) return -1; if (l_offset > r->offset) return 1; return 0; } #define __node_2_uprobe(node) \ rb_entry((node), struct uprobe, rb_node) struct __uprobe_key { struct inode *inode; loff_t offset; }; static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) { const struct __uprobe_key *a = key; return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); } static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) { struct uprobe *u = __node_2_uprobe(a); return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); } /* * Assumes being inside RCU protected region. * No refcount is taken on returned uprobe. */ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) { struct __uprobe_key key = { .inode = inode, .offset = offset, }; struct rb_node *node; unsigned int seq; lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); /* * Lockless RB-tree lookups can result only in false negatives. * If the element is found, it is correct and can be returned * under RCU protection. If we find nothing, we need to * validate that seqcount didn't change. If it did, we have to * try again as we might have missed the element (false * negative). If seqcount is unchanged, search truly failed. */ if (node) return __node_2_uprobe(node); } while (read_seqcount_retry(&uprobes_seqcount, seq)); return NULL; } /* * Attempt to insert a new uprobe into uprobes_tree. * * If uprobe already exists (for given inode+offset), we just increment * refcount of previously existing uprobe. * * If not, a provided new instance of uprobe is inserted into the tree (with * assumed initial refcount == 1). * * In any case, we return a uprobe instance that ends up being in uprobes_tree. * Caller has to clean up new uprobe instance, if it ended up not being * inserted into the tree. * * We assume that uprobes_treelock is held for writing. */ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { struct rb_node *node; again: node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node); if (!try_get_uprobe(u)) { rb_erase(node, &uprobes_tree); RB_CLEAR_NODE(&u->rb_node); goto again; } return u; } return uprobe; } /* * Acquire uprobes_treelock and insert uprobe into uprobes_tree * (or reuse existing one, see __insert_uprobe() comments above). */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; write_lock(&uprobes_treelock); write_seqcount_begin(&uprobes_seqcount); u = __insert_uprobe(uprobe); write_seqcount_end(&uprobes_seqcount); write_unlock(&uprobes_treelock); return u; } static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, (unsigned long long) uprobe->ref_ctr_offset); } static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, loff_t ref_ctr_offset) { struct uprobe *uprobe, *cur_uprobe; uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL); if (!uprobe) return ERR_PTR(-ENOMEM); uprobe->inode = inode; uprobe->offset = offset; uprobe->ref_ctr_offset = ref_ctr_offset; INIT_LIST_HEAD(&uprobe->consumers); init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); RB_CLEAR_NODE(&uprobe->rb_node); refcount_set(&uprobe->ref, 1); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); /* a uprobe exists for this inode:offset combination */ if (cur_uprobe != uprobe) { if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) { ref_ctr_mismatch_warn(cur_uprobe, uprobe); put_uprobe(cur_uprobe); kfree(uprobe); return ERR_PTR(-EINVAL); } kfree(uprobe); uprobe = cur_uprobe; } return uprobe; } static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { static atomic64_t id; down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } /* * For uprobe @uprobe, delete the consumer @uc. * Should never be called with consumer that's not part of @uprobe->consumers. */ static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc) { down_write(&uprobe->consumer_rwsem); list_del_rcu(&uc->cons_node); up_write(&uprobe->consumer_rwsem); } static int __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset) { struct page *page; /* * Ensure that the page that has the original instruction is populated * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register(). */ if (mapping->a_ops->read_folio) page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); put_page(page); return 0; } static int copy_insn(struct uprobe *uprobe, struct file *filp) { struct address_space *mapping = uprobe->inode->i_mapping; loff_t offs = uprobe->offset; void *insn = &uprobe->arch.insn; int size = sizeof(uprobe->arch.insn); int len, err = -EIO; /* Copy only available bytes, -EIO if nothing was read */ do { if (offs >= i_size_read(uprobe->inode)) break; len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); err = __copy_insn(mapping, filp, insn, len, offs); if (err) break; insn += len; offs += len; size -= len; } while (size); return err; } static int prepare_uprobe(struct uprobe *uprobe, struct file *file, struct mm_struct *mm, unsigned long vaddr) { int ret = 0; if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret; /* TODO: move this into _register, until then we abuse this sem. */ down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out; ret = copy_insn(uprobe, file); if (ret) goto out; ret = -ENOTSUPP; if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out; smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */ set_bit(UPROBE_COPY_INSN, &uprobe->flags); out: up_write(&uprobe->consumer_rwsem); return ret; } static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm) { return !uc->filter || uc->filter(uc, mm); } static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) { struct uprobe_consumer *uc; bool ret = false; down_read(&uprobe->consumer_rwsem); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; } up_read(&uprobe->consumer_rwsem); return ret; } static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) { bool first_uprobe; int ret; ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret; /* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); if (first_uprobe) set_bit(MMF_HAS_UPROBES, &mm->flags); ret = set_swbp(&uprobe->arch, mm, vaddr); if (!ret) clear_bit(MMF_RECALC_UPROBES, &mm->flags); else if (first_uprobe) clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } static int remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { set_bit(MMF_RECALC_UPROBES, &mm->flags); return set_orig_insn(&uprobe->arch, mm, vaddr); } struct map_info { struct map_info *next; struct mm_struct *mm; unsigned long vaddr; }; static inline struct map_info *free_map_info(struct map_info *info) { struct map_info *next = info->next; kfree(info); return next; } static struct map_info * build_map_info(struct address_space *mapping, loff_t offset, bool is_register) { unsigned long pgoff = offset >> PAGE_SHIFT; struct vm_area_struct *vma; struct map_info *curr = NULL; struct map_info *prev = NULL; struct map_info *info; int more = 0; again: i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (prev) prev->next = NULL; } if (!prev) { more++; continue; } if (!mmget_not_zero(vma->vm_mm)) continue; info = prev; prev = prev->next; info->next = curr; curr = info; info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } i_mmap_unlock_read(mapping); if (!more) goto out; prev = curr; while (curr) { mmput(curr->mm); curr = curr->next; } do { info = kmalloc(sizeof(struct map_info), GFP_KERNEL); if (!info) { curr = ERR_PTR(-ENOMEM); goto out; } info->next = prev; prev = info; } while (--more); goto again; out: while (prev) prev = free_map_info(prev); return curr; } static int register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) { bool is_register = !!new; struct map_info *info; int err = 0; percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); if (IS_ERR(info)) { err = PTR_ERR(info); goto out; } while (info) { struct mm_struct *mm = info->mm; struct vm_area_struct *vma; if (err && is_register) goto free; /* * We take mmap_lock for writing to avoid the race with * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make * is_trap_at_addr() true right after find_uprobe_rcu() * returns NULL in find_active_uprobe_rcu(). */ mmap_write_lock(mm); if (check_stable_address_space(mm)) goto unlock; vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { /* consult only the "caller", new consumer. */ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, mm, vma, info->vaddr); } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, mm, info->vaddr); } unlock: mmap_write_unlock(mm); free: mmput(mm); info = free_map_info(info); } out: percpu_up_write(&dup_mmap_sem); return err; } /** * uprobe_unregister_nosync - unregister an already registered probe. * @uprobe: uprobe to remove * @uc: identify which probe if multiple probes are colocated. */ void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) { int err; down_write(&uprobe->register_rwsem); consumer_del(uprobe, uc); err = register_for_each_vma(uprobe, NULL); up_write(&uprobe->register_rwsem); /* TODO : cant unregister? schedule a worker thread */ if (unlikely(err)) { uprobe_warn(current, "unregister, leaking uprobe"); return; } put_uprobe(uprobe); } EXPORT_SYMBOL_GPL(uprobe_unregister_nosync); void uprobe_unregister_sync(void) { /* * Now that handler_chain() and handle_uretprobe_chain() iterate over * uprobe->consumers list under RCU protection without holding * uprobe->register_rwsem, we need to wait for RCU grace period to * make sure that we can't call into just unregistered * uprobe_consumer's callbacks anymore. If we don't do that, fast and * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. */ synchronize_rcu_tasks_trace(); synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); /** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return: pointer to the new uprobe on success or an ERR_PTR on failure. */ struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { struct uprobe *uprobe; int ret; /* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) return ERR_PTR(-EINVAL); /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio && !shmem_mapping(inode->i_mapping)) return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return ERR_PTR(-EINVAL); /* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary. */ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL); uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe; down_write(&uprobe->register_rwsem); consumer_add(uprobe, uc); ret = register_for_each_vma(uprobe, uc); up_write(&uprobe->register_rwsem); if (ret) { uprobe_unregister_nosync(uprobe, uc); /* * Registration might have partially succeeded, so we can have * this consumer being called right at this time. We need to * sync here. It's ok, it's unlikely slow path. */ uprobe_unregister_sync(); return ERR_PTR(ret); } return uprobe; } EXPORT_SYMBOL_GPL(uprobe_register); /** * uprobe_apply - add or remove the breakpoints according to @uc->filter * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints * Return: 0 on success or negative error code. */ int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; int ret = -ENOENT; down_write(&uprobe->register_rwsem); rcu_read_lock_trace(); list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? uc : NULL); break; } } rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); return ret; } static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; if (!valid_vma(vma, false) || file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; if (uprobe->offset < offset || uprobe->offset >= offset + vma->vm_end - vma->vm_start) continue; vaddr = offset_to_vaddr(vma, uprobe->offset); err |= remove_breakpoint(uprobe, mm, vaddr); } mmap_read_unlock(mm); return err; } static struct rb_node * find_node_in_range(struct inode *inode, loff_t min, loff_t max) { struct rb_node *n = uprobes_tree.rb_node; while (n) { struct uprobe *u = rb_entry(n, struct uprobe, rb_node); if (inode < u->inode) { n = n->rb_left; } else if (inode > u->inode) { n = n->rb_right; } else { if (max < u->offset) n = n->rb_left; else if (min > u->offset) n = n->rb_right; else break; } } return n; } /* * For a given range in vma, build a list of probes that need to be inserted. */ static void build_probe_list(struct inode *inode, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *head) { loff_t min, max; struct rb_node *n, *t; struct uprobe *u; INIT_LIST_HEAD(head); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u)) list_add(&u->pending_list, head); } } read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ static int delayed_ref_ctr_inc(struct vm_area_struct *vma) { struct list_head *pos, *q; struct delayed_uprobe *du; unsigned long vaddr; int ret = 0, err = 0; mutex_lock(&delayed_uprobe_lock); list_for_each_safe(pos, q, &delayed_uprobe_list) { du = list_entry(pos, struct delayed_uprobe, list); if (du->mm != vma->vm_mm || !valid_ref_ctr_vma(du->uprobe, vma)) continue; vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset); ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); if (ret) { update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); if (!err) err = ret; } delayed_uprobe_delete(du); } mutex_unlock(&delayed_uprobe_lock); return err; } /* * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; if (no_uprobe_events()) return 0; if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) return 0; inode = file_inode(vma->vm_file); if (!inode) return 0; mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); /* * We can race with uprobe_unregister(), this uprobe can be already * removed. But in this case filter_chain() must return false, all * consumers have gone away. */ list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) && filter_chain(uprobe, vma->vm_mm)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); return 0; } static bool vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) { loff_t min, max; struct inode *inode; struct rb_node *n; inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); read_unlock(&uprobes_treelock); return !!n; } /* * Called in context of a munmap of a vma. */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (no_uprobe_events() || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; if (vma_has_uprobes(vma, start, end)) set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { struct xol_area *area = vma->vm_mm->uprobes_state.xol_area; vmf->page = area->page; get_page(vmf->page); return 0; } static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { return -EPERM; } static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, .mremap = xol_mremap, }; /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { struct vm_area_struct *vma; int ret; if (mmap_write_lock_killable(mm)) return -EINTR; if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } } vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO| VM_SEALED_SYSMAP, &xol_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto fail; } ret = 0; /* pairs with get_xol_area() */ smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: mmap_write_unlock(mm); return ret; } void * __weak arch_uprobe_trampoline(unsigned long *psize) { static uprobe_opcode_t insn = UPROBE_SWBP_INSN; *psize = UPROBE_SWBP_INSN_SIZE; return &insn; } static struct xol_area *__create_xol_area(unsigned long vaddr) { struct mm_struct *mm = current->mm; unsigned long insns_size; struct xol_area *area; void *insns; area = kzalloc(sizeof(*area), GFP_KERNEL); if (unlikely(!area)) goto out; area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), GFP_KERNEL); if (!area->bitmap) goto free_area; area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); if (!area->page) goto free_bitmap; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); insns = arch_uprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); if (!xol_add_vma(mm, area)) return area; __free_page(area->page); free_bitmap: kfree(area->bitmap); free_area: kfree(area); out: return NULL; } /* * get_xol_area - Allocate process's xol_area if necessary. * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL. */ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; if (!mm->uprobes_state.xol_area) __create_xol_area(0); /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } /* * uprobe_clear_state - Free the area allocated for slots. */ void uprobe_clear_state(struct mm_struct *mm) { struct xol_area *area = mm->uprobes_state.xol_area; mutex_lock(&delayed_uprobe_lock); delayed_uprobe_remove(NULL, mm); mutex_unlock(&delayed_uprobe_lock); if (!area) return; put_page(area->page); kfree(area->bitmap); kfree(area); } void uprobe_start_dup_mmap(void) { percpu_down_read(&dup_mmap_sem); } void uprobe_end_dup_mmap(void) { percpu_up_read(&dup_mmap_sem); } void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ set_bit(MMF_RECALC_UPROBES, &newmm->flags); } } static unsigned long xol_get_slot_nr(struct xol_area *area) { unsigned long slot_nr; slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); if (slot_nr < UINSNS_PER_PAGE) { if (!test_and_set_bit(slot_nr, area->bitmap)) return slot_nr; } return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. */ static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { struct xol_area *area = get_xol_area(); unsigned long slot_nr; if (!area) return false; wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return true; } /* * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ static void xol_free_insn_slot(struct uprobe_task *utask) { struct xol_area *area = current->mm->uprobes_state.xol_area; unsigned long offset = utask->xol_vaddr - area->vaddr; unsigned int slot_nr; utask->xol_vaddr = 0; /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; slot_nr = offset / UPROBE_XOL_SLOT_BYTES; clear_bit(slot_nr, area->bitmap); smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len) { /* Initialize the slot */ copy_to_page(page, vaddr, src, len); /* * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. */ flush_dcache_page(page); } /** * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs * @regs: Reflects the saved state of the task after it has hit a breakpoint * instruction. * Return the address of the breakpoint instruction. */ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) { return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; } unsigned long uprobe_get_trap_addr(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (unlikely(utask && utask->active_uprobe)) return utask->vaddr; return instruction_pointer(regs); } static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri) { ri->cons_cnt = 0; ri->next = utask->ri_pool; utask->ri_pool = ri; } static struct return_instance *ri_pool_pop(struct uprobe_task *utask) { struct return_instance *ri = utask->ri_pool; if (likely(ri)) utask->ri_pool = ri->next; return ri; } static void ri_free(struct return_instance *ri) { kfree(ri->extra_consumers); kfree_rcu(ri, rcu); } static void free_ret_instance(struct uprobe_task *utask, struct return_instance *ri, bool cleanup_hprobe) { unsigned seq; if (cleanup_hprobe) { enum hprobe_state hstate; (void)hprobe_consume(&ri->hprobe, &hstate); hprobe_finalize(&ri->hprobe, hstate); } /* * At this point return_instance is unlinked from utask's * return_instances list and this has become visible to ri_timer(). * If seqcount now indicates that ri_timer's return instance * processing loop isn't active, we can return ri into the pool of * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. * Admittedly, this is a rather simple use of seqcount, but it nicely * abstracts away all the necessary memory barriers, so we use * a well-supported kernel primitive here. */ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */ ri_pool_push(utask, ri); } else { /* we might be racing with ri_timer(), so play it safe */ ri_free(ri); } } /* * Called with no locks held. * Called in context of an exiting or an exec-ing thread. */ void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; struct return_instance *ri, *ri_next; if (!utask) return; t->utask = NULL; WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) { ri_next = ri->next; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } /* free_ret_instance() above might add to ri_pool, so this loop should come last */ ri = utask->ri_pool; while (ri) { ri_next = ri->next; ri_free(ri); ri = ri_next; } kfree(utask); } #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ #define for_each_ret_instance_rcu(pos, head) \ for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) static void ri_timer(struct timer_list *timer) { struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); struct return_instance *ri; /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ guard(srcu)(&uretprobes_srcu); /* RCU protects return_instance from freeing. */ guard(rcu)(); /* * See free_ret_instance() for notes on seqcount use. * We also employ raw API variants to avoid lockdep false-positive * warning complaining about enabled preemption. The timer can only be * invoked once for a uprobe_task. Therefore there can only be one * writer. The reader does not require an even sequence count to make * progress, so it is OK to remain preemptible on PREEMPT_RT. */ raw_write_seqcount_begin(&utask->ri_seqcount); for_each_ret_instance_rcu(ri, utask->return_instances) hprobe_expire(&ri->hprobe, false); raw_write_seqcount_end(&utask->ri_seqcount); } static struct uprobe_task *alloc_utask(void) { struct uprobe_task *utask; utask = kzalloc(sizeof(*utask), GFP_KERNEL); if (!utask) return NULL; timer_setup(&utask->ri_timer, ri_timer, 0); seqcount_init(&utask->ri_seqcount); return utask; } /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise */ static struct uprobe_task *get_utask(void) { if (!current->utask) current->utask = alloc_utask(); return current->utask; } static struct return_instance *alloc_return_instance(struct uprobe_task *utask) { struct return_instance *ri; ri = ri_pool_pop(utask); if (ri) return ri; ri = kzalloc(sizeof(*ri), GFP_KERNEL); if (!ri) return ZERO_SIZE_PTR; return ri; } static struct return_instance *dup_return_instance(struct return_instance *old) { struct return_instance *ri; ri = kmemdup(old, sizeof(*ri), GFP_KERNEL); if (!ri) return NULL; if (unlikely(old->cons_cnt > 1)) { ri->extra_consumers = kmemdup(old->extra_consumers, sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1), GFP_KERNEL); if (!ri->extra_consumers) { kfree(ri); return NULL; } } return ri; } static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; struct uprobe *uprobe; n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; /* protect uprobes from freeing, we'll need try_get_uprobe() them */ guard(srcu)(&uretprobes_srcu); p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { n = dup_return_instance(o); if (!n) return -ENOMEM; /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ uprobe = hprobe_expire(&o->hprobe, true); /* * New utask will have stable properly refcounted uprobe or * NULL. Even if we failed to get refcounted uprobe, we still * need to preserve full set of return_instances for proper * uretprobe handling and nesting in forked task. */ hprobe_init_stable(&n->hprobe, uprobe); n->next = NULL; rcu_assign_pointer(*p, n); p = &n->next; n_utask->depth++; } return 0; } static void dup_xol_work(struct callback_head *work) { if (current->flags & PF_EXITING) return; if (!__create_xol_area(current->utask->dup_xol_addr) && !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } /* * Called in context of a new clone/fork from copy_process. */ void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; struct xol_area *area; t->utask = NULL; if (!utask || !utask->return_instances) return; if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances"); /* The task can fork() after dup_xol_work() fails */ area = mm->uprobes_state.xol_area; if (!area) return uprobe_warn(t, "dup xol area"); if (mm == t->mm) return; t->utask->dup_xol_addr = area->vaddr; init_task_work(&t->utask->dup_xol_work, dup_xol_work); task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME); } /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. * * Returns -1 in case the xol_area is not allocated. */ unsigned long uprobe_get_trampoline_vaddr(void) { unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR; struct xol_area *area; /* Pairs with xol_add_vma() smp_store_release() */ area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; return trampoline_vaddr; } static void cleanup_return_instances(struct uprobe_task *utask, bool chained, struct pt_regs *regs) { struct return_instance *ri = utask->return_instances, *ri_next; enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; free_ret_instance(utask, ri, true /* cleanup_hprobe */); ri = ri_next; } } static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, struct return_instance *ri) { struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; int srcu_idx; if (!get_xol_area()) goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); goto free; } trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); cleanup_return_instances(utask, chained, regs); /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ uprobe_warn(current, "handle tail call"); goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } /* __srcu_read_lock() because SRCU lock survives switch to user space */ srcu_idx = __srcu_read_lock(&uretprobes_srcu); ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; rcu_assign_pointer(utask->return_instances, ri); mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; free: ri_free(ri); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { struct uprobe_task *utask = current->utask; int err; if (!try_get_uprobe(uprobe)) return -EINVAL; if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } utask->vaddr = bp_vaddr; err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { xol_free_insn_slot(utask); goto err_out; } utask->active_uprobe = uprobe; utask->state = UTASK_SSTEP; return 0; err_out: put_uprobe(uprobe); return err; } /* * If we are singlestepping, then ensure this thread is not connected to * non-fatal signals until completion of singlestep. When xol insn itself * triggers the signal, restart the original insn even if the task is * already SIGKILL'ed (since coredump should report the correct ip). This * is even more important if the task has a handler for SIGSEGV/etc, The * _same_ instruction should be repeated again after return from the signal * handler, and SSTEP can never finish in this case. */ bool uprobe_deny_signal(void) { struct task_struct *t = current; struct uprobe_task *utask = t->utask; if (likely(!utask || !utask->active_uprobe)) return false; WARN_ON_ONCE(utask->state != UTASK_SSTEP); if (task_sigpending(t)) { utask->signal_denied = true; clear_tsk_thread_flag(t, TIF_SIGPENDING); if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) { utask->state = UTASK_SSTEP_TRAPPED; set_tsk_thread_flag(t, TIF_UPROBE); } } return true; } static void mmf_recalc_uprobes(struct mm_struct *mm) { VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. * Or this uprobe can be filtered out. */ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return; } clear_bit(MMF_HAS_UPROBES, &mm->flags); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; int result; if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) return -EINVAL; pagefault_disable(); result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) goto out; result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result; copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; struct file *vm_file; loff_t offset; unsigned int seq; guard(rcu)(); if (!mmap_lock_speculate_try_begin(mm, &seq)) return NULL; vma = vma_lookup(mm, bp_vaddr); if (!vma) return NULL; /* * vm_file memory can be reused for another instance of struct file, * but can't be freed from under us, so it's safe to read fields from * it, even if the values are some garbage values; ultimately * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure * that whatever we speculatively found is correct */ vm_file = READ_ONCE(vma->vm_file); if (!vm_file) return NULL; offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start); uprobe = find_uprobe_rcu(vm_file->f_inode, offset); if (!uprobe) return NULL; /* now double check that nothing about MM changed */ if (mmap_lock_speculate_retry(mm, seq)) return NULL; return uprobe; } /* assumes being inside RCU protected region */ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; struct uprobe *uprobe = NULL; struct vm_area_struct *vma; uprobe = find_active_uprobe_speculative(bp_vaddr); if (uprobe) return uprobe; mmap_read_lock(mm); vma = vma_lookup(mm, bp_vaddr); if (vma) { if (vma->vm_file) { struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe_rcu(inode, offset); } if (!uprobe) *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); return uprobe; } static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie) { struct return_consumer *ric; if (unlikely(ri == ZERO_SIZE_PTR)) return ri; if (unlikely(ri->cons_cnt > 0)) { ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL); if (!ric) { ri_free(ri); return ZERO_SIZE_PTR; } ri->extra_consumers = ric; } ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1]; ric->id = id; ric->cookie = cookie; ri->cons_cnt++; return ri; } static struct return_consumer * return_consumer_find(struct return_instance *ri, int *iter, int id) { struct return_consumer *ric; int idx; for (idx = *iter; idx < ri->cons_cnt; idx++) { ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1]; if (ric->id == id) { *iter = idx + 1; return ric; } } return NULL; } static bool ignore_ret_handler(int rc) { return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; } static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; bool has_consumers = false, remove = true; struct return_instance *ri = NULL; struct uprobe_task *utask = current->utask; utask->auprobe = &uprobe->arch; list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; __u64 cookie = 0; int rc = 0; if (uc->handler) { rc = uc->handler(uc, regs, &cookie); WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; if (!uc->ret_handler || ignore_ret_handler(rc)) continue; if (!ri) ri = alloc_return_instance(utask); if (session) ri = push_consumer(ri, uc->id, cookie); } utask->auprobe = NULL; if (!ZERO_OR_NULL_PTR(ri)) prepare_uretprobe(uprobe, regs, ri); if (remove && has_consumers) { down_read(&uprobe->register_rwsem); /* re-check that removal is still required, this time under lock */ if (!filter_chain(uprobe, current->mm)) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); } up_read(&uprobe->register_rwsem); } } static void handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { struct return_consumer *ric; struct uprobe_consumer *uc; int ric_idx = 0; /* all consumers unsubscribed meanwhile */ if (unlikely(!uprobe)) return; rcu_read_lock_trace(); list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { bool session = uc->handler && uc->ret_handler; if (uc->ret_handler) { ric = return_consumer_find(ri, &ric_idx, uc->id); if (!session || ric) uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); } } rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) { bool chained; do { chained = ri->chained; ri = ri->next; /* can't be NULL if chained */ } while (chained); return ri; } void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *ri_next, *next_chain; struct uprobe *uprobe; enum hprobe_state hstate; bool valid; utask = current->utask; if (!utask) goto sigill; ri = utask->return_instances; if (!ri) goto sigill; do { /* * We should throw out the frames invalidated by longjmp(). * If this chain is valid, then the next one should be alive * or NULL; the latter case means that nobody but ri->func * could hit this trampoline on return. TODO: sigaltstack(). */ next_chain = find_next_ret_chain(ri); valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs); instruction_pointer_set(regs, ri->orig_ret_vaddr); do { /* pop current instance from the stack of pending return instances, * as it's not pending anymore: we just fixed up original * instruction pointer in regs and are about to call handlers; * this allows fixup_uretprobe_trampoline_entries() to properly fix up * captured stack traces from uretprobe handlers, in which pending * trampoline addresses on the stack are replaced with correct * original return addresses */ ri_next = ri->next; rcu_assign_pointer(utask->return_instances, ri_next); utask->depth--; uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) handle_uretprobe_chain(ri, uprobe, regs); hprobe_finalize(&ri->hprobe, hstate); /* We already took care of hprobe, no need to waste more time on that. */ free_ret_instance(utask, ri, false /* !cleanup_hprobe */); ri = ri_next; } while (ri != next_chain); } while (!valid); return; sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) { return false; } bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, struct pt_regs *regs) { return true; } /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. */ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ force_sig(SIGTRAP); } else { /* * Either we raced with uprobe_unregister() or we can't * access this memory. The latter is only possible if * another thread plays with our ->mm. In both cases * we can simply restart. If this vma was unmapped we * can pretend this insn was not executed yet and get * the (correct) SIGSEGV after restart. */ instruction_pointer_set(regs, bp_vaddr); } goto out; } /* change it in advance for ->handler() and restart */ instruction_pointer_set(regs, bp_vaddr); /* * TODO: move copy_insn/etc into _register and remove this hack. * After we hit the bp, _unregister + _register can install the * new and not-yet-analyzed uprobe at the same address, restart. */ if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) goto out; /* * Pairs with the smp_wmb() in prepare_uprobe(). * * Guarantees that if we see the UPROBE_COPY_INSN bit set, then * we must also see the stores to &uprobe->arch performed by the * prepare_uprobe() call. */ smp_rmb(); /* Tracing handlers use ->utask to communicate with fetch methods */ if (!get_utask()) goto out; if (arch_uprobe_ignore(&uprobe->arch, regs)) goto out; handler_chain(uprobe, regs); if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (pre_ssout(uprobe, regs, bp_vaddr)) goto out; out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ rcu_read_unlock_trace(); } /* * Perform required fix-ups and disable singlestep. * Allow pending signals to take effect. */ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else WARN_ON_ONCE(1); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; xol_free_insn_slot(utask); if (utask->signal_denied) { set_thread_flag(TIF_SIGPENDING); utask->signal_denied = false; } if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); force_sig(SIGILL); } } /* * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and * allows the thread to return from interrupt. After that handle_swbp() * sets utask->active_uprobe. * * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag * and allows the thread to return from interrupt. * * While returning to userspace, thread notices the TIF_UPROBE flag and calls * uprobe_notify_resume(). */ void uprobe_notify_resume(struct pt_regs *regs) { struct uprobe_task *utask; clear_thread_flag(TIF_UPROBE); utask = current->utask; if (utask && utask->active_uprobe) handle_singlestep(utask, regs); else handle_swbp(regs); } /* * uprobe_pre_sstep_notifier gets called from interrupt context as part of * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit. */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { if (!current->mm) return 0; if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); return 1; } /* * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep. */ int uprobe_post_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask = current->utask; if (!current->mm || !utask || !utask->active_uprobe) /* task is currently not uprobed */ return 0; utask->state = UTASK_SSTEP_ACK; set_thread_flag(TIF_UPROBE); return 1; } static struct notifier_block uprobe_exception_nb = { .notifier_call = arch_uprobe_exception_notify, .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); BUG_ON(register_die_notifier(&uprobe_exception_nb)); }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0-or-later /* * Cryptographic API. * * Cipher operations. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * 2002 Adam J. Richter <adam@yggdrasil.com> * 2004 Jean-Luc Cooke <jlcooke@certainkey.com> */ #include <crypto/scatterwalk.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/scatterlist.h> void scatterwalk_skip(struct scatter_walk *walk, unsigned int nbytes) { struct scatterlist *sg = walk->sg; nbytes += walk->offset - sg->offset; while (nbytes > sg->length) { nbytes -= sg->length; sg = sg_next(sg); } walk->sg = sg; walk->offset = sg->offset + nbytes; } EXPORT_SYMBOL_GPL(scatterwalk_skip); inline void memcpy_from_scatterwalk(void *buf, struct scatter_walk *walk, unsigned int nbytes) { do { unsigned int to_copy; to_copy = scatterwalk_next(walk, nbytes); memcpy(buf, walk->addr, to_copy); scatterwalk_done_src(walk, to_copy); buf += to_copy; nbytes -= to_copy; } while (nbytes); } EXPORT_SYMBOL_GPL(memcpy_from_scatterwalk); inline void memcpy_to_scatterwalk(struct scatter_walk *walk, const void *buf, unsigned int nbytes) { do { unsigned int to_copy; to_copy = scatterwalk_next(walk, nbytes); memcpy(walk->addr, buf, to_copy); scatterwalk_done_dst(walk, to_copy); buf += to_copy; nbytes -= to_copy; } while (nbytes); } EXPORT_SYMBOL_GPL(memcpy_to_scatterwalk); void memcpy_from_sglist(void *buf, struct scatterlist *sg, unsigned int start, unsigned int nbytes) { struct scatter_walk walk; if (unlikely(nbytes == 0)) /* in case sg == NULL */ return; scatterwalk_start_at_pos(&walk, sg, start); memcpy_from_scatterwalk(buf, &walk, nbytes); } EXPORT_SYMBOL_GPL(memcpy_from_sglist); void memcpy_to_sglist(struct scatterlist *sg, unsigned int start, const void *buf, unsigned int nbytes) { struct scatter_walk walk; if (unlikely(nbytes == 0)) /* in case sg == NULL */ return; scatterwalk_start_at_pos(&walk, sg, start); memcpy_to_scatterwalk(&walk, buf, nbytes); } EXPORT_SYMBOL_GPL(memcpy_to_sglist); void memcpy_sglist(struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { struct scatter_walk swalk; struct scatter_walk dwalk; if (unlikely(nbytes == 0)) /* in case sg == NULL */ return; scatterwalk_start(&swalk, src); scatterwalk_start(&dwalk, dst); do { unsigned int slen, dlen; unsigned int len; slen = scatterwalk_next(&swalk, nbytes); dlen = scatterwalk_next(&dwalk, nbytes); len = min(slen, dlen); memcpy(dwalk.addr, swalk.addr, len); scatterwalk_done_dst(&dwalk, len); scatterwalk_done_src(&swalk, len); nbytes -= len; } while (nbytes); } EXPORT_SYMBOL_GPL(memcpy_sglist); struct scatterlist *scatterwalk_ffwd(struct scatterlist dst[2], struct scatterlist *src, unsigned int len) { for (;;) { if (!len) return src; if (src->length > len) break; len -= src->length; src = sg_next(src); } sg_init_table(dst, 2); sg_set_page(dst, sg_page(src), src->length - len, src->offset + len); scatterwalk_crypto_chain(dst, sg_next(src), 2); return dst; } EXPORT_SYMBOL_GPL(scatterwalk_ffwd);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> #include <linux/gfp.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/ratelimit.h> #include <linux/bits.h> #include <linux/ktime.h> #include <linux/bitmap.h> #include <linux/mnt_idmapping.h> #include "super.h" #include "mds_client.h" #include "crypto.h" #include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) /* * A cluster of MDS (metadata server) daemons is responsible for * managing the file system namespace (the directory hierarchy and * inodes) and for coordinating shared access to storage. Metadata is * partitioning hierarchically across a number of servers, and that * partition varies over time as the cluster adjusts the distribution * in order to balance load. * * The MDS client is primarily responsible to managing synchronous * metadata requests for operations like open, unlink, and so forth. * If there is a MDS failure, we find out about it when we (possibly * request and) receive a new MDS map, and can resubmit affected * requests. * * For the most part, though, we take advantage of a lossless * communications channel to the MDS, and do not need to worry about * timing out or resubmitting requests. * * We maintain a stateful "session" with each MDS we interact with. * Within each session, we sent periodic heartbeat messages to ensure * any capabilities or leases we have been issues remain valid. If * the session times out and goes stale, our leases and capabilities * are no longer valid. */ struct ceph_reconnect_state { struct ceph_mds_session *session; int nr_caps, nr_realms; struct ceph_pagelist *pagelist; unsigned msg_version; bool allow_multi; }; static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); static void ceph_cap_release_work(struct work_struct *work); static void ceph_cap_reclaim_work(struct work_struct *work); static const struct ceph_connection_operations mds_con_ops; /* * mds reply parsing */ static int parse_reply_info_quota(void **p, void *end, struct ceph_mds_reply_info_in *info) { u8 struct_v, struct_compat; u32 struct_len; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only * understand encoding with struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); end = *p + struct_len; ceph_decode_64_safe(p, end, info->max_bytes, bad); ceph_decode_64_safe(p, end, info->max_files, bad); *p = end; return 0; bad: return -EIO; } /* * parse individual inode info */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, u64 features) { int err = 0; u8 struct_v = 0; if (features == (u64)-1) { u32 struct_len; u8 struct_compat; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only understand * encoding with struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); end = *p + struct_len; } ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad); info->in = *p; *p += sizeof(struct ceph_mds_reply_inode) + sizeof(*info->in->fragtree.splits) * le32_to_cpu(info->in->fragtree.nsplits); ceph_decode_32_safe(p, end, info->symlink_len, bad); ceph_decode_need(p, end, info->symlink_len, bad); info->symlink = *p; *p += info->symlink_len; ceph_decode_copy_safe(p, end, &info->dir_layout, sizeof(info->dir_layout), bad); ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; *p += info->xattr_len; if (features == (u64)-1) { /* inline data */ ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); ceph_decode_need(p, end, info->inline_len, bad); info->inline_data = *p; *p += info->inline_len; /* quota */ err = parse_reply_info_quota(p, end, info); if (err < 0) goto out_bad; /* pool namespace */ ceph_decode_32_safe(p, end, info->pool_ns_len, bad); if (info->pool_ns_len > 0) { ceph_decode_need(p, end, info->pool_ns_len, bad); info->pool_ns_data = *p; *p += info->pool_ns_len; } /* btime */ ceph_decode_need(p, end, sizeof(info->btime), bad); ceph_decode_copy(p, &info->btime, sizeof(info->btime)); /* change attribute */ ceph_decode_64_safe(p, end, info->change_attr, bad); /* dir pin */ if (struct_v >= 2) { ceph_decode_32_safe(p, end, info->dir_pin, bad); } else { info->dir_pin = -ENODATA; } /* snapshot birth time, remains zero for v<=2 */ if (struct_v >= 3) { ceph_decode_need(p, end, sizeof(info->snap_btime), bad); ceph_decode_copy(p, &info->snap_btime, sizeof(info->snap_btime)); } else { memset(&info->snap_btime, 0, sizeof(info->snap_btime)); } /* snapshot count, remains zero for v<=3 */ if (struct_v >= 4) { ceph_decode_64_safe(p, end, info->rsnaps, bad); } else { info->rsnaps = 0; } if (struct_v >= 5) { u32 alen; ceph_decode_32_safe(p, end, alen, bad); while (alen--) { u32 len; /* key */ ceph_decode_32_safe(p, end, len, bad); ceph_decode_skip_n(p, end, len, bad); /* value */ ceph_decode_32_safe(p, end, len, bad); ceph_decode_skip_n(p, end, len, bad); } } /* fscrypt flag -- ignore */ if (struct_v >= 6) ceph_decode_skip_8(p, end, bad); info->fscrypt_auth = NULL; info->fscrypt_auth_len = 0; info->fscrypt_file = NULL; info->fscrypt_file_len = 0; if (struct_v >= 7) { ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad); if (info->fscrypt_auth_len) { info->fscrypt_auth = kmalloc(info->fscrypt_auth_len, GFP_KERNEL); if (!info->fscrypt_auth) return -ENOMEM; ceph_decode_copy_safe(p, end, info->fscrypt_auth, info->fscrypt_auth_len, bad); } ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad); if (info->fscrypt_file_len) { info->fscrypt_file = kmalloc(info->fscrypt_file_len, GFP_KERNEL); if (!info->fscrypt_file) return -ENOMEM; ceph_decode_copy_safe(p, end, info->fscrypt_file, info->fscrypt_file_len, bad); } } *p = end; } else { /* legacy (unversioned) struct */ if (features & CEPH_FEATURE_MDS_INLINE_DATA) { ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); ceph_decode_need(p, end, info->inline_len, bad); info->inline_data = *p; *p += info->inline_len; } else info->inline_version = CEPH_INLINE_NONE; if (features & CEPH_FEATURE_MDS_QUOTA) { err = parse_reply_info_quota(p, end, info); if (err < 0) goto out_bad; } else { info->max_bytes = 0; info->max_files = 0; } info->pool_ns_len = 0; info->pool_ns_data = NULL; if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { ceph_decode_32_safe(p, end, info->pool_ns_len, bad); if (info->pool_ns_len > 0) { ceph_decode_need(p, end, info->pool_ns_len, bad); info->pool_ns_data = *p; *p += info->pool_ns_len; } } if (features & CEPH_FEATURE_FS_BTIME) { ceph_decode_need(p, end, sizeof(info->btime), bad); ceph_decode_copy(p, &info->btime, sizeof(info->btime)); ceph_decode_64_safe(p, end, info->change_attr, bad); } info->dir_pin = -ENODATA; /* info->snap_btime and info->rsnaps remain zero */ } return 0; bad: err = -EIO; out_bad: return err; } static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_dirfrag **dirfrag, u64 features) { if (features == (u64)-1) { u8 struct_v, struct_compat; u32 struct_len; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only understand * encoding whose struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); end = *p + struct_len; } ceph_decode_need(p, end, sizeof(**dirfrag), bad); *dirfrag = *p; *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist); if (unlikely(*p > end)) goto bad; if (features == (u64)-1) *p = end; return 0; bad: return -EIO; } static int parse_reply_info_lease(void **p, void *end, struct ceph_mds_reply_lease **lease, u64 features, u32 *altname_len, u8 **altname) { u8 struct_v; u32 struct_len; void *lend; if (features == (u64)-1) { u8 struct_compat; ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); /* struct_v is expected to be >= 1. we only understand * encoding whose struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); } else { struct_len = sizeof(**lease); *altname_len = 0; *altname = NULL; } lend = *p + struct_len; ceph_decode_need(p, end, struct_len, bad); *lease = *p; *p += sizeof(**lease); if (features == (u64)-1) { if (struct_v >= 2) { ceph_decode_32_safe(p, end, *altname_len, bad); ceph_decode_need(p, end, *altname_len, bad); *altname = *p; *p += *altname_len; } else { *altname = NULL; *altname_len = 0; } } *p = lend; return 0; bad: return -EIO; } /* * parse a normal reply, which may contain a (dir+)dentry and/or a * target inode. */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { int err; if (info->head->is_dentry) { err = parse_reply_info_in(p, end, &info->diri, features); if (err < 0) goto out_bad; err = parse_reply_info_dir(p, end, &info->dirfrag, features); if (err < 0) goto out_bad; ceph_decode_32_safe(p, end, info->dname_len, bad); ceph_decode_need(p, end, info->dname_len, bad); info->dname = *p; *p += info->dname_len; err = parse_reply_info_lease(p, end, &info->dlease, features, &info->altname_len, &info->altname); if (err < 0) goto out_bad; } if (info->head->is_target) { err = parse_reply_info_in(p, end, &info->targeti, features); if (err < 0) goto out_bad; } if (unlikely(*p != end)) goto bad; return 0; bad: err = -EIO; out_bad: pr_err("problem parsing mds trace %d\n", err); return err; } /* * parse readdir results */ static int parse_reply_info_readdir(void **p, void *end, struct ceph_mds_request *req, u64 features) { struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; struct ceph_client *cl = req->r_mdsc->fsc->client; u32 num, i = 0; int err; err = parse_reply_info_dir(p, end, &info->dir_dir, features); if (err < 0) goto out_bad; ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); { u16 flags = ceph_decode_16(p); info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); } if (num == 0) goto done; BUG_ON(!info->dir_entries); if ((unsigned long)(info->dir_entries + num) > (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err_client(cl, "dir contents are larger than expected\n"); WARN_ON(1); goto bad; } info->dir_nr = num; while (num) { struct inode *inode = d_inode(req->r_dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; struct fscrypt_str tname = FSTR_INIT(NULL, 0); struct fscrypt_str oname = FSTR_INIT(NULL, 0); struct ceph_fname fname; u32 altname_len, _name_len; u8 *altname, *_name; /* dentry */ ceph_decode_32_safe(p, end, _name_len, bad); ceph_decode_need(p, end, _name_len, bad); _name = *p; *p += _name_len; doutc(cl, "parsed dir dname '%.*s'\n", _name_len, _name); if (info->hash_order) rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, _name, _name_len); /* dentry lease */ err = parse_reply_info_lease(p, end, &rde->lease, features, &altname_len, &altname); if (err) goto out_bad; /* * Try to dencrypt the dentry names and update them * in the ceph_mds_reply_dir_entry struct. */ fname.dir = inode; fname.name = _name; fname.name_len = _name_len; fname.ctext = altname; fname.ctext_len = altname_len; /* * The _name_len maybe larger than altname_len, such as * when the human readable name length is in range of * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE), * then the copy in ceph_fname_to_usr will corrupt the * data if there has no encryption key. * * Just set the no_copy flag and then if there has no * encryption key the oname.name will be assigned to * _name always. */ fname.no_copy = true; if (altname_len == 0) { /* * Set tname to _name, and this will be used * to do the base64_decode in-place. It's * safe because the decoded string should * always be shorter, which is 3/4 of origin * string. */ tname.name = _name; /* * Set oname to _name too, and this will be * used to do the dencryption in-place. */ oname.name = _name; oname.len = _name_len; } else { /* * This will do the decryption only in-place * from altname cryptext directly. */ oname.name = altname; oname.len = altname_len; } rde->is_nokey = false; err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey); if (err) { pr_err_client(cl, "unable to decode %.*s, got %d\n", _name_len, _name, err); goto out_bad; } rde->name = oname.name; rde->name_len = oname.len; /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; /* ceph_readdir_prepopulate() will update it */ rde->offset = 0; i++; num--; } done: /* Skip over any unrecognized fields */ *p = end; return 0; bad: err = -EIO; out_bad: pr_err_client(cl, "problem parsing dir contents %d\n", err); return err; } /* * parse fcntl F_GETLK results */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; info->filelock_reply = *p; /* Skip over any unrecognized fields */ *p = end; return 0; bad: return -EIO; } #if BITS_PER_LONG == 64 #define DELEGATED_INO_AVAILABLE xa_mk_value(1) static int ceph_parse_deleg_inos(void **p, void *end, struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; u32 sets; ceph_decode_32_safe(p, end, sets, bad); doutc(cl, "got %u sets of delegated inodes\n", sets); while (sets--) { u64 start, len; ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); /* Don't accept a delegation of system inodes */ if (start < CEPH_INO_SYSTEM_BASE) { pr_warn_ratelimited_client(cl, "ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", start, len); continue; } while (len--) { int err = xa_insert(&s->s_delegated_inos, start++, DELEGATED_INO_AVAILABLE, GFP_KERNEL); if (!err) { doutc(cl, "added delegated inode 0x%llx\n", start - 1); } else if (err == -EBUSY) { pr_warn_client(cl, "MDS delegated inode 0x%llx more than once.\n", start - 1); } else { return err; } } } return 0; bad: return -EIO; } u64 ceph_get_deleg_ino(struct ceph_mds_session *s) { unsigned long ino; void *val; xa_for_each(&s->s_delegated_inos, ino, val) { val = xa_erase(&s->s_delegated_inos, ino); if (val == DELEGATED_INO_AVAILABLE) return ino; } return 0; } int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) { return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE, GFP_KERNEL); } #else /* BITS_PER_LONG == 64 */ /* * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top * and bottom words? */ static int ceph_parse_deleg_inos(void **p, void *end, struct ceph_mds_session *s) { u32 sets; ceph_decode_32_safe(p, end, sets, bad); if (sets) ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad); return 0; bad: return -EIO; } u64 ceph_get_deleg_ino(struct ceph_mds_session *s) { return 0; } int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) { return 0; } #endif /* BITS_PER_LONG == 64 */ /* * parse create results */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features, struct ceph_mds_session *s) { int ret; if (features == (u64)-1 || (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { if (*p == end) { /* Malformed reply? */ info->has_create_ino = false; } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { info->has_create_ino = true; /* struct_v, struct_compat, and len */ ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad); ceph_decode_64_safe(p, end, info->ino, bad); ret = ceph_parse_deleg_inos(p, end, s); if (ret) return ret; } else { /* legacy */ ceph_decode_64_safe(p, end, info->ino, bad); info->has_create_ino = true; } } else { if (*p != end) goto bad; } /* Skip over any unrecognized fields */ *p = end; return 0; bad: return -EIO; } static int parse_reply_info_getvxattr(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { u32 value_len; ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ ceph_decode_skip_32(p, end, bad); /* skip payload length */ ceph_decode_32_safe(p, end, value_len, bad); if (value_len == end - *p) { info->xattr_info.xattr_value = *p; info->xattr_info.xattr_value_len = value_len; *p = end; return value_len; } bad: return -EIO; } /* * parse extra results */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_request *req, u64 features, struct ceph_mds_session *s) { struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; u32 op = le32_to_cpu(info->head->op); if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_readdir(p, end, req, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); else if (op == CEPH_MDS_OP_GETVXATTR) return parse_reply_info_getvxattr(p, end, info, features); else return -EIO; } /* * parse entire mds reply */ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, struct ceph_mds_request *req, u64 features) { struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; struct ceph_client *cl = s->s_mdsc->fsc->client; void *p, *end; u32 len; int err; info->head = msg->front.iov_base; p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; } /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); err = parse_reply_info_extra(&p, p+len, req, features, s); if (err < 0) goto out_bad; } /* snap blob */ ceph_decode_32_safe(&p, end, len, bad); info->snapblob_len = len; info->snapblob = p; p += len; if (p != end) goto bad; return 0; bad: err = -EIO; out_bad: pr_err_client(cl, "mds parse_reply err %d\n", err); ceph_msg_dump(msg); return err; } static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { int i; kfree(info->diri.fscrypt_auth); kfree(info->diri.fscrypt_file); kfree(info->targeti.fscrypt_auth); kfree(info->targeti.fscrypt_file); if (!info->dir_entries) return; for (i = 0; i < info->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; kfree(rde->inode.fscrypt_auth); kfree(rde->inode.fscrypt_file); } free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } /* * In async unlink case the kclient won't wait for the first reply * from MDS and just drop all the links and unhash the dentry and then * succeeds immediately. * * For any new create/link/rename,etc requests followed by using the * same file names we must wait for the first reply of the inflight * unlink request, or the MDS possibly will fail these following * requests with -EEXIST if the inflight async unlink request was * delayed for some reasons. * * And the worst case is that for the none async openc request it will * successfully open the file if the CDentry hasn't been unlinked yet, * but later the previous delayed async unlink request will remove the * CDentry. That means the just created file is possibly deleted later * by accident. * * We need to wait for the inflight async unlink requests to finish * when creating new files/directories by using the same file names. */ int ceph_wait_on_conflict_unlink(struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(dentry->d_sb); struct ceph_client *cl = fsc->client; struct dentry *pdentry = dentry->d_parent; struct dentry *udentry, *found = NULL; struct ceph_dentry_info *di; struct qstr dname; u32 hash = dentry->d_name.hash; int err; dname.name = dentry->d_name.name; dname.len = dentry->d_name.len; rcu_read_lock(); hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, hnode, hash) { udentry = di->dentry; spin_lock(&udentry->d_lock); if (udentry->d_name.hash != hash) goto next; if (unlikely(udentry->d_parent != pdentry)) goto next; if (!hash_hashed(&di->hnode)) goto next; if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) pr_warn_client(cl, "dentry %p:%pd async unlink bit is not set\n", dentry, dentry); if (!d_same_name(udentry, pdentry, &dname)) goto next; found = dget_dlock(udentry); spin_unlock(&udentry->d_lock); break; next: spin_unlock(&udentry->d_lock); } rcu_read_unlock(); if (likely(!found)) return 0; doutc(cl, "dentry %p:%pd conflict with old %p:%pd\n", dentry, dentry, found, found); err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, TASK_KILLABLE); dput(found); return err; } /* * sessions */ const char *ceph_session_state_name(int s) { switch (s) { case CEPH_MDS_SESSION_NEW: return "new"; case CEPH_MDS_SESSION_OPENING: return "opening"; case CEPH_MDS_SESSION_OPEN: return "open"; case CEPH_MDS_SESSION_HUNG: return "hung"; case CEPH_MDS_SESSION_CLOSING: return "closing"; case CEPH_MDS_SESSION_CLOSED: return "closed"; case CEPH_MDS_SESSION_RESTARTING: return "restarting"; case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; case CEPH_MDS_SESSION_REJECTED: return "rejected"; default: return "???"; } } struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) { if (refcount_inc_not_zero(&s->s_ref)) return s; return NULL; } void ceph_put_mds_session(struct ceph_mds_session *s) { if (IS_ERR_OR_NULL(s)) return; if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); WARN_ON(mutex_is_locked(&s->s_mutex)); xa_destroy(&s->s_delegated_inos); kfree(s); } } /* * called under mdsc->mutex */ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, int mds) { if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; return ceph_get_mds_session(mdsc->sessions[mds]); } static bool __have_session(struct ceph_mds_client *mdsc, int mds) { if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return false; else return true; } static int __verify_registered_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { if (s->s_mds >= mdsc->max_sessions || mdsc->sessions[s->s_mds] != s) return -ENOENT; return 0; } /* * create+register a new session for given mds. * called under mdsc->mutex. */ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, int mds) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *s; if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) return ERR_PTR(-EIO); if (mds >= mdsc->mdsmap->possible_max_rank) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); if (mds >= mdsc->max_sessions) { int newmax = 1 << get_count_order(mds + 1); struct ceph_mds_session **sa; doutc(cl, "realloc to %d\n", newmax); sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); if (!sa) goto fail_realloc; if (mdsc->sessions) { memcpy(sa, mdsc->sessions, mdsc->max_sessions * sizeof(void *)); kfree(mdsc->sessions); } mdsc->sessions = sa; mdsc->max_sessions = newmax; } doutc(cl, "mds%d\n", mds); s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; mutex_init(&s->s_mutex); ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); atomic_set(&s->s_cap_gen, 1); s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); INIT_LIST_HEAD(&s->s_caps); refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); xa_init(&s->s_delegated_inos); INIT_LIST_HEAD(&s->s_cap_releases); INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); INIT_LIST_HEAD(&s->s_cap_dirty); INIT_LIST_HEAD(&s->s_cap_flushing); mdsc->sessions[mds] = s; atomic_inc(&mdsc->num_sessions); refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); return s; fail_realloc: kfree(s); return ERR_PTR(-ENOMEM); } /* * called under mdsc->mutex */ static void __unregister_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { doutc(mdsc->fsc->client, "mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); ceph_put_mds_session(s); atomic_dec(&mdsc->num_sessions); } /* * drop session refs in request. * * should be last request ref, or hold mdsc->mutex */ static void put_request_session(struct ceph_mds_request *req) { if (req->r_session) { ceph_put_mds_session(req->r_session); req->r_session = NULL; } } void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, void (*cb)(struct ceph_mds_session *), bool check_state) { int mds; mutex_lock(&mdsc->mutex); for (mds = 0; mds < mdsc->max_sessions; ++mds) { struct ceph_mds_session *s; s = __ceph_lookup_mds_session(mdsc, mds); if (!s) continue; if (check_state && !check_session_state(s)) { ceph_put_mds_session(s); continue; } mutex_unlock(&mdsc->mutex); cb(s); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); } void ceph_mdsc_release_request(struct kref *kref) { struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); ceph_mdsc_release_dir_caps_async(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); if (req->r_reply) ceph_msg_put(req->r_reply); if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } if (req->r_parent) { ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); iput(req->r_parent); } iput(req->r_target_inode); iput(req->r_new_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) dput(req->r_old_dentry); if (req->r_old_dentry_dir) { /* * track (and drop pins for) r_old_dentry_dir * separately, since r_old_dentry's d_parent may have * changed between the dir mutex being dropped and * this request being freed. */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); put_cred(req->r_cred); if (req->r_mnt_idmap) mnt_idmap_put(req->r_mnt_idmap); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); kfree(req->r_fscrypt_auth); kfree(req->r_altname); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); WARN_ON_ONCE(!list_empty(&req->r_wait)); kmem_cache_free(ceph_mds_request_cachep, req); } DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) /* * lookup session, bump ref if found. * * called under mdsc->mutex. */ static struct ceph_mds_request * lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; req = lookup_request(&mdsc->request_tree, tid); if (req) ceph_mdsc_get_request(req); return req; } /* * Register an in-flight request, and assign a tid. Link to directory * are modifying (if any). * * Called under mdsc->mutex. */ static void __register_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, struct inode *dir) { struct ceph_client *cl = mdsc->fsc->client; int ret = 0; req->r_tid = ++mdsc->last_tid; if (req->r_num_caps) { ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation, req->r_num_caps); if (ret < 0) { pr_err_client(cl, "%p failed to reserve caps: %d\n", req, ret); /* set req->r_err to fail early from __do_request */ req->r_err = ret; return; } } doutc(cl, "%p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); insert_request(&mdsc->request_tree, req); req->r_cred = get_current_cred(); if (!req->r_mnt_idmap) req->r_mnt_idmap = &nop_mnt_idmap; if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) mdsc->oldest_tid = req->r_tid; if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); ihold(dir); req->r_unsafe_dir = dir; spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); spin_unlock(&ci->i_unsafe_lock); } } static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { doutc(mdsc->fsc->client, "%p tid %lld\n", req, req->r_tid); /* Never leave an unregistered request on an unsafe list! */ list_del_init(&req->r_unsafe_item); if (req->r_tid == mdsc->oldest_tid) { struct rb_node *p = rb_next(&req->r_node); mdsc->oldest_tid = 0; while (p) { struct ceph_mds_request *next_req = rb_entry(p, struct ceph_mds_request, r_node); if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { mdsc->oldest_tid = next_req->r_tid; break; } p = rb_next(p); } } erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); } if (req->r_target_inode && test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_target_item); spin_unlock(&ci->i_unsafe_lock); } if (req->r_unsafe_dir) { iput(req->r_unsafe_dir); req->r_unsafe_dir = NULL; } complete_all(&req->r_safe_completion); ceph_mdsc_put_request(req); } /* * Walk back up the dentry tree until we hit a dentry representing a * non-snapshot inode. We do this using the rcu_read_lock (which must be held * when calling this) to ensure that the objects won't disappear while we're * working with them. Once we hit a candidate dentry, we attempt to take a * reference to it, and return that as the result. */ static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode *inode = NULL; while (dentry && !IS_ROOT(dentry)) { inode = d_inode_rcu(dentry); if (!inode || ceph_snap(inode) == CEPH_NOSNAP) break; dentry = dentry->d_parent; } if (inode) inode = igrab(inode); return inode; } /* * Choose mds to send request to next. If there is a hint set in the * request (e.g., due to a prior forward hint from the mds), use that. * Otherwise, consult frag tree and/or caps to identify the * appropriate mds. If all else fails, choose randomly. * * Called under mdsc->mutex. */ static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, bool *random) { struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; int mode = req->r_direct_mode; int mds = -1; u32 hash = req->r_direct_hash; bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); struct ceph_client *cl = mdsc->fsc->client; if (random) *random = false; /* * is there a specific mds we should try? ignore hint if we have * no session and the mds is not up (active or recovering). */ if (req->r_resend_mds >= 0 && (__have_session(mdsc, req->r_resend_mds) || ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { doutc(cl, "using resend_mds mds%d\n", req->r_resend_mds); return req->r_resend_mds; } if (mode == USE_RANDOM_MDS) goto random; inode = NULL; if (req->r_inode) { if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { inode = req->r_inode; ihold(inode); } else { /* req->r_dentry is non-null for LSSNAP request */ rcu_read_lock(); inode = get_nonsnap_parent(req->r_dentry); rcu_read_unlock(); doutc(cl, "using snapdir's parent %p %llx.%llx\n", inode, ceph_vinop(inode)); } } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; rcu_read_lock(); parent = READ_ONCE(req->r_dentry->d_parent); dir = req->r_parent ? : d_inode_rcu(parent); if (!dir || dir->i_sb != mdsc->fsc->sb) { /* not this fs or parent went negative */ inode = d_inode(req->r_dentry); if (inode) ihold(inode); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ inode = get_nonsnap_parent(parent); doutc(cl, "using nonsnap parent %p %llx.%llx\n", inode, ceph_vinop(inode)); } else { /* dentry target */ inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ inode = igrab(dir); hash = ceph_dentry_hash(dir, req->r_dentry); is_hash = true; } else { ihold(inode); } } rcu_read_unlock(); } if (!inode) goto random; doutc(cl, "%p %llx.%llx is_hash=%d (0x%x) mode %d\n", inode, ceph_vinop(inode), (int)is_hash, hash, mode); ci = ceph_inode(inode); if (is_hash && S_ISDIR(inode->i_mode)) { struct ceph_inode_frag frag; int found; ceph_choose_frag(ci, hash, &frag, &found); if (found) { if (mode == USE_ANY_MDS && frag.ndist > 0) { u8 r; /* choose a random replica */ get_random_bytes(&r, 1); r %= frag.ndist; mds = frag.dist[r]; doutc(cl, "%p %llx.%llx frag %u mds%d (%d/%d)\n", inode, ceph_vinop(inode), frag.frag, mds, (int)r, frag.ndist); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE && !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } /* since this file/dir wasn't known to be * replicated, then we want to look for the * authoritative mds. */ if (frag.mds >= 0) { /* choose auth mds */ mds = frag.mds; doutc(cl, "%p %llx.%llx frag %u mds%d (auth)\n", inode, ceph_vinop(inode), frag.frag, mds); if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= CEPH_MDS_STATE_ACTIVE) { if (!ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) goto out; } } mode = USE_AUTH_MDS; } } spin_lock(&ci->i_ceph_lock); cap = NULL; if (mode == USE_AUTH_MDS) cap = ci->i_auth_cap; if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { spin_unlock(&ci->i_ceph_lock); iput(inode); goto random; } mds = cap->session->s_mds; doutc(cl, "%p %llx.%llx mds%d (%scap %p)\n", inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); spin_unlock(&ci->i_ceph_lock); out: iput(inode); return mds; random: if (random) *random = true; mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); doutc(cl, "chose random mds%d\n", mds); return mds; } /* * session messages */ struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, false); if (!msg) { pr_err("ENOMEM creating session %s msg\n", ceph_session_op_name(op)); return NULL; } h = msg->front.iov_base; h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); return msg; } static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) static int encode_supported_features(void **p, void *end) { static const size_t count = ARRAY_SIZE(feature_bits); if (count > 0) { size_t i; size_t size = FEATURE_BYTES(count); unsigned long bit; if (WARN_ON_ONCE(*p + 4 + size > end)) return -ERANGE; ceph_encode_32(p, size); memset(*p, 0, size); for (i = 0; i < count; i++) { bit = feature_bits[i]; ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8); } *p += size; } else { if (WARN_ON_ONCE(*p + 4 > end)) return -ERANGE; ceph_encode_32(p, 0); } return 0; } static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED; #define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8) static int encode_metric_spec(void **p, void *end) { static const size_t count = ARRAY_SIZE(metric_bits); /* header */ if (WARN_ON_ONCE(*p + 2 > end)) return -ERANGE; ceph_encode_8(p, 1); /* version */ ceph_encode_8(p, 1); /* compat */ if (count > 0) { size_t i; size_t size = METRIC_BYTES(count); if (WARN_ON_ONCE(*p + 4 + 4 + size > end)) return -ERANGE; /* metric spec info length */ ceph_encode_32(p, 4 + size); /* metric spec */ ceph_encode_32(p, size); memset(*p, 0, size); for (i = 0; i < count; i++) ((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8); *p += size; } else { if (WARN_ON_ONCE(*p + 4 + 4 > end)) return -ERANGE; /* metric spec info length */ ceph_encode_32(p, 4); /* metric spec */ ceph_encode_32(p, 0); } return 0; } /* * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. */ static struct ceph_msg * create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; int i; int extra_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; struct ceph_client *cl = mdsc->fsc->client; size_t size, count; void *p, *end; int ret; const char* metadata[][2] = { {"hostname", mdsc->nodename}, {"kernel_version", init_utsname()->release}, {"entity_id", opt->name ? : ""}, {"root", fsopt->server_path ? : "/"}, {NULL, NULL} }; /* Calculate serialized length of metadata */ extra_bytes = 4; /* map length */ for (i = 0; metadata[i][0]; ++i) { extra_bytes += 8 + strlen(metadata[i][0]) + strlen(metadata[i][1]); metadata_key_count++; } /* supported feature */ size = 0; count = ARRAY_SIZE(feature_bits); if (count > 0) size = FEATURE_BYTES(count); extra_bytes += 4 + size; /* metric spec */ size = 0; count = ARRAY_SIZE(metric_bits); if (count > 0) size = METRIC_BYTES(count); extra_bytes += 2 + 4 + 4 + size; /* flags, mds auth caps and oldest_client_tid */ extra_bytes += 4 + 4 + 8; /* Allocate the message */ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); if (!msg) { pr_err_client(cl, "ENOMEM creating session open msg\n"); return ERR_PTR(-ENOMEM); } p = msg->front.iov_base; end = p + msg->front.iov_len; h = p; h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> * * ClientSession messages with metadata are v7 */ msg->hdr.version = cpu_to_le16(7); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ p += sizeof(*h); /* Number of entries in the map */ ceph_encode_32(&p, metadata_key_count); /* Two length-prefixed strings for each entry in the map */ for (i = 0; metadata[i][0]; ++i) { size_t const key_len = strlen(metadata[i][0]); size_t const val_len = strlen(metadata[i][1]); ceph_encode_32(&p, key_len); memcpy(p, metadata[i][0], key_len); p += key_len; ceph_encode_32(&p, val_len); memcpy(p, metadata[i][1], val_len); p += val_len; } ret = encode_supported_features(&p, end); if (ret) { pr_err_client(cl, "encode_supported_features failed!\n"); ceph_msg_put(msg); return ERR_PTR(ret); } ret = encode_metric_spec(&p, end); if (ret) { pr_err_client(cl, "encode_metric_spec failed!\n"); ceph_msg_put(msg); return ERR_PTR(ret); } /* version == 5, flags */ ceph_encode_32(&p, 0); /* version == 6, mds auth caps */ ceph_encode_32(&p, 0); /* version == 7, oldest_client_tid */ ceph_encode_64(&p, mdsc->oldest_tid); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return msg; } /* * send session open request. * * called under mdsc->mutex */ static int __open_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *msg; int mstate; int mds = session->s_mds; if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) return -EIO; /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); doutc(mdsc->fsc->client, "open_session to mds%d (%s)\n", mds, ceph_mds_state_name(mstate)); session->s_state = CEPH_MDS_SESSION_OPENING; session->s_renew_requested = jiffies; /* send connect message */ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN, session->s_seq); if (IS_ERR(msg)) return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } /* * open sessions for any export targets for the given mds * * called under mdsc->mutex */ static struct ceph_mds_session * __open_export_target_session(struct ceph_mds_client *mdsc, int target) { struct ceph_mds_session *session; int ret; session = __ceph_lookup_mds_session(mdsc, target); if (!session) { session = register_session(mdsc, target); if (IS_ERR(session)) return session; } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { ret = __open_session(mdsc, session); if (ret) return ERR_PTR(ret); } return session; } struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) { struct ceph_mds_session *session; struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "to mds%d\n", target); mutex_lock(&mdsc->mutex); session = __open_export_target_session(mdsc, target); mutex_unlock(&mdsc->mutex); return session; } static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; struct ceph_client *cl = mdsc->fsc->client; if (mds >= mdsc->mdsmap->possible_max_rank) return; mi = &mdsc->mdsmap->m_info[mds]; doutc(cl, "for mds%d (%d targets)\n", session->s_mds, mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { ts = __open_export_target_session(mdsc, mi->export_targets[i]); ceph_put_mds_session(ts); } } /* * session caps */ static void detach_cap_releases(struct ceph_mds_session *session, struct list_head *target) { struct ceph_client *cl = session->s_mdsc->fsc->client; lockdep_assert_held(&session->s_cap_lock); list_splice_init(&session->s_cap_releases, target); session->s_num_cap_releases = 0; doutc(cl, "mds%d\n", session->s_mds); } static void dispose_cap_releases(struct ceph_mds_client *mdsc, struct list_head *dispose) { while (!list_empty(dispose)) { struct ceph_cap *cap; /* zero out the in-progress message */ cap = list_first_entry(dispose, struct ceph_cap, session_caps); list_del(&cap->session_caps); ceph_put_cap(mdsc, cap); } } static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct rb_node *p; doutc(cl, "mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); while (!list_empty(&session->s_unsafe)) { req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited_client(cl, " dropping unsafe request %llu\n", req->r_tid); if (req->r_target_inode) mapping_set_error(req->r_target_inode->i_mapping, -EIO); if (req->r_unsafe_dir) mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ p = rb_first(&mdsc->request_tree); while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); if (req->r_session && req->r_session->s_mds == session->s_mds) req->r_attempts = 0; } mutex_unlock(&mdsc->mutex); } /* * Helper to safely iterate over all caps associated with a session, with * special care taken to handle a racing __ceph_remove_cap(). * * Caller must hold session s_mutex. */ int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg) { struct ceph_client *cl = session->s_mdsc->fsc->client; struct list_head *p; struct ceph_cap *cap; struct inode *inode, *last_inode = NULL; struct ceph_cap *old_cap = NULL; int ret; doutc(cl, "%p mds%d\n", session, session->s_mds); spin_lock(&session->s_cap_lock); p = session->s_caps.next; while (p != &session->s_caps) { int mds; cap = list_entry(p, struct ceph_cap, session_caps); inode = igrab(&cap->ci->netfs.inode); if (!inode) { p = p->next; continue; } session->s_cap_iterator = cap; mds = cap->mds; spin_unlock(&session->s_cap_lock); if (last_inode) { iput(last_inode); last_inode = NULL; } if (old_cap) { ceph_put_cap(session->s_mdsc, old_cap); old_cap = NULL; } ret = cb(inode, mds, arg); last_inode = inode; spin_lock(&session->s_cap_lock); p = p->next; if (!cap->ci) { doutc(cl, "finishing cap %p removal\n", cap); BUG_ON(cap->session != session); cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; atomic64_dec(&session->s_mdsc->metric.total_caps); if (cap->queue_release) __ceph_queue_cap_release(session, cap); else old_cap = cap; /* put_cap it w/o locks held */ } if (ret < 0) goto out; } ret = 0; out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); return ret; } static int remove_session_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); bool invalidate = false; struct ceph_cap *cap; int iputs = 0; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (cap) { doutc(cl, " removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->netfs.inode); iputs = ceph_purge_inode_cap(inode, cap, &invalidate); } spin_unlock(&ci->i_ceph_lock); if (cap) wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); while (iputs--) iput(inode); return 0; } /* * caller must hold session s_mutex */ static void remove_session_caps(struct ceph_mds_session *session) { struct ceph_fs_client *fsc = session->s_mdsc->fsc; struct super_block *sb = fsc->sb; LIST_HEAD(dispose); doutc(fsc->client, "on %p\n", session); ceph_iterate_session_caps(session, remove_session_caps_cb, fsc); wake_up_all(&fsc->mdsc->cap_flushing_wq); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; /* * iterate_session_caps() skips inodes that are being * deleted, we need to wait until deletions are complete. * __wait_on_freeing_inode() is designed for the job, * but it is not exported, so use lookup inode function * to access it. */ while (!list_empty(&session->s_caps)) { cap = list_entry(session->s_caps.next, struct ceph_cap, session_caps); if (cap == prev) break; prev = cap; vino = cap->ci->i_vino; spin_unlock(&session->s_cap_lock); inode = ceph_find_inode(sb, vino); iput(inode); spin_lock(&session->s_cap_lock); } } // drop cap expires and unlock s_cap_lock detach_cap_releases(session, &dispose); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); spin_unlock(&session->s_cap_lock); dispose_cap_releases(session->s_mdsc, &dispose); } enum { RECONNECT, RENEWCAPS, FORCE_RO, }; /* * wake up any threads waiting on this session's caps. if the cap is * old (didn't get renewed on the client reconnect), remove it now. * * caller must hold s_mutex. */ static int wake_up_session_cb(struct inode *inode, int mds, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned long ev = (unsigned long)arg; if (ev == RECONNECT) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } else if (ev == RENEWCAPS) { struct ceph_cap *cap; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); /* mds did not re-issue stale cap */ if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen)) cap->issued = cap->implemented = CEPH_CAP_PIN; spin_unlock(&ci->i_ceph_lock); } else if (ev == FORCE_RO) { } wake_up_all(&ci->i_cap_wq); return 0; } static void wake_up_session_caps(struct ceph_mds_session *session, int ev) { struct ceph_client *cl = session->s_mdsc->fsc->client; doutc(cl, "session %p mds%d\n", session, session->s_mds); ceph_iterate_session_caps(session, wake_up_session_cb, (void *)(unsigned long)ev); } /* * Send periodic message to MDS renewing all currently held caps. The * ack will reset the expiration for all caps from this session. * * caller holds s_mutex */ static int send_renew_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; int state; if (time_after_eq(jiffies, session->s_cap_ttl) && time_after_eq(session->s_cap_ttl, session->s_renew_requested)) pr_info_client(cl, "mds%d caps stale\n", session->s_mds); session->s_renew_requested = jiffies; /* do not try to renew caps until a recovering mds has reconnected * with its clients. */ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); if (state < CEPH_MDS_STATE_RECONNECT) { doutc(cl, "ignoring mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); return 0; } doutc(cl, "to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); if (IS_ERR(msg)) return PTR_ERR(msg); ceph_con_send(&session->s_con, msg); return 0; } static int send_flushmsg_ack(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, u64 seq) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; doutc(cl, "to mds%d (%s)s seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), seq); msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); return 0; } /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * * Called under session->s_mutex */ static void renewed_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int is_renew) { struct ceph_client *cl = mdsc->fsc->client; int was_stale; int wake = 0; spin_lock(&session->s_cap_lock); was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); session->s_cap_ttl = session->s_renew_requested + mdsc->mdsmap->m_session_timeout*HZ; if (was_stale) { if (time_before(jiffies, session->s_cap_ttl)) { pr_info_client(cl, "mds%d caps renewed\n", session->s_mds); wake = 1; } else { pr_info_client(cl, "mds%d caps still stale\n", session->s_mds); } } doutc(cl, "mds%d ttl now %lu, was %s, now %s\n", session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); spin_unlock(&session->s_cap_lock); if (wake) wake_up_session_caps(session, RENEWCAPS); } /* * send a session close request */ static int request_close_session(struct ceph_mds_session *session) { struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; doutc(cl, "mds%d state %s seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), session->s_seq); msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); return 1; } /* * Called with s_mutex held. */ static int __close_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { if (session->s_state >= CEPH_MDS_SESSION_CLOSING) return 0; session->s_state = CEPH_MDS_SESSION_CLOSING; return request_close_session(session); } static bool drop_negative_children(struct dentry *dentry) { struct dentry *child; bool all_negative = true; if (!d_is_dir(dentry)) goto out; spin_lock(&dentry->d_lock); hlist_for_each_entry(child, &dentry->d_children, d_sib) { if (d_really_is_positive(child)) { all_negative = false; break; } } spin_unlock(&dentry->d_lock); if (all_negative) shrink_dcache_parent(dentry); out: return all_negative; } /* * Trim old(er) caps. * * Because we can't cache an inode without one or more caps, we do * this indirectly: if a cap is unused, we prune its aliases, at which * point the inode will hopefully get dropped to. * * Yes, this is a bit sloppy. Our only real goal here is to respond to * memory pressure from the MDS, though, so it needn't be perfect. */ static int trim_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = mdsc->fsc->client; int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; struct ceph_cap *cap; if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { spin_unlock(&ci->i_ceph_lock); return 0; } mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); doutc(cl, "%p %llx.%llx cap %p mine %s oissued %s used %s wanted %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(mine), ceph_cap_string(oissued), ceph_cap_string(used), ceph_cap_string(wanted)); if (cap == ci->i_auth_cap) { if (ci->i_dirty_caps || ci->i_flushing_caps || !list_empty(&ci->i_cap_snaps)) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; /* Note: it's possible that i_filelock_ref becomes non-zero * after dropping auth caps. It doesn't hurt because reply * of lock mds request will re-add auth caps. */ if (atomic_read(&ci->i_filelock_ref) > 0) goto out; } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ if (S_ISREG(inode->i_mode) && wanted == 0 && used == CEPH_CAP_FILE_CACHE && !(oissued & CEPH_CAP_FILE_CACHE)) { used = 0; oissued = 0; } if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ if (oissued) { /* we aren't the only cap.. just remove us */ ceph_remove_cap(mdsc, cap, true); (*remaining)--; } else { struct dentry *dentry; /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); dentry = d_find_any_alias(inode); if (dentry && drop_negative_children(dentry)) { int count; dput(dentry); d_prune_aliases(inode); count = atomic_read(&inode->i_count); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", inode, ceph_vinop(inode), cap, count); } else { dput(dentry); } return 0; } out: spin_unlock(&ci->i_ceph_lock); return 0; } /* * Trim session cap count down to some max number. */ int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int max_caps) { struct ceph_client *cl = mdsc->fsc->client; int trim_caps = session->s_nr_caps - max_caps; doutc(cl, "mds%d start: %d / %d, trim %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { int remaining = trim_caps; ceph_iterate_session_caps(session, trim_caps_cb, &remaining); doutc(cl, "mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps - remaining); } ceph_flush_session_cap_releases(mdsc, session); return 0; } static int check_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; int ret = 1; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_first_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); if (cf->tid <= want_flush_tid) { doutc(cl, "still flushing tid %llu <= %llu\n", cf->tid, want_flush_tid); ret = 0; } } spin_unlock(&mdsc->cap_dirty_lock); return ret; } /* * flush all dirty inode data to disk. * * returns true if we've flushed through want_flush_tid */ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "want %llu\n", want_flush_tid); wait_event(mdsc->cap_flushing_wq, check_caps_flush(mdsc, want_flush_tid)); doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); } /* * called under s_mutex */ static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; struct ceph_cap *cap; LIST_HEAD(tmp_list); int num_cap_releases; __le32 barrier, *cap_barrier; down_read(&osdc->lock); barrier = cpu_to_le32(osdc->epoch_barrier); up_read(&osdc->lock); spin_lock(&session->s_cap_lock); again: list_splice_init(&session->s_cap_releases, &tmp_list); num_cap_releases = session->s_num_cap_releases; session->s_num_cap_releases = 0; spin_unlock(&session->s_cap_lock); while (!list_empty(&tmp_list)) { if (!msg) { msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_SIZE, GFP_NOFS, false); if (!msg) goto out_err; head = msg->front.iov_base; head->num = cpu_to_le32(0); msg->front.iov_len = sizeof(*head); msg->hdr.version = cpu_to_le16(2); msg->hdr.compat_version = cpu_to_le16(1); } cap = list_first_entry(&tmp_list, struct ceph_cap, session_caps); list_del(&cap->session_caps); num_cap_releases--; head = msg->front.iov_base; put_unaligned_le32(get_unaligned_le32(&head->num) + 1, &head->num); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(cap->cap_ino); item->cap_id = cpu_to_le64(cap->cap_id); item->migrate_seq = cpu_to_le32(cap->mseq); item->issue_seq = cpu_to_le32(cap->issue_seq); msg->front.iov_len += sizeof(*item); ceph_put_cap(mdsc, cap); if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { // Append cap_barrier field cap_barrier = msg->front.iov_base + msg->front.iov_len; *cap_barrier = barrier; msg->front.iov_len += sizeof(*cap_barrier); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); doutc(cl, "mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); msg = NULL; } } BUG_ON(num_cap_releases != 0); spin_lock(&session->s_cap_lock); if (!list_empty(&session->s_cap_releases)) goto again; spin_unlock(&session->s_cap_lock); if (msg) { // Append cap_barrier field cap_barrier = msg->front.iov_base + msg->front.iov_len; *cap_barrier = barrier; msg->front.iov_len += sizeof(*cap_barrier); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); doutc(cl, "mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); } return; out_err: pr_err_client(cl, "mds%d, failed to allocate message\n", session->s_mds); spin_lock(&session->s_cap_lock); list_splice(&tmp_list, &session->s_cap_releases); session->s_num_cap_releases += num_cap_releases; spin_unlock(&session->s_cap_lock); } static void ceph_cap_release_work(struct work_struct *work) { struct ceph_mds_session *session = container_of(work, struct ceph_mds_session, s_cap_release_work); mutex_lock(&session->s_mutex); if (session->s_state == CEPH_MDS_SESSION_OPEN || session->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(session->s_mdsc, session); mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); } void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; ceph_get_mds_session(session); if (queue_work(mdsc->fsc->cap_wq, &session->s_cap_release_work)) { doutc(cl, "cap release work queued\n"); } else { ceph_put_mds_session(session); doutc(cl, "failed to queue cap release work\n"); } } /* * caller holds session->s_cap_lock */ void __ceph_queue_cap_release(struct ceph_mds_session *session, struct ceph_cap *cap) { list_add_tail(&cap->session_caps, &session->s_cap_releases); session->s_num_cap_releases++; if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) ceph_flush_session_cap_releases(session->s_mdsc, session); } static void ceph_cap_reclaim_work(struct work_struct *work) { struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, cap_reclaim_work); int ret = ceph_trim_dentries(mdsc); if (ret == -EAGAIN) ceph_queue_cap_reclaim_work(mdsc); } void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { doutc(cl, "caps reclaim work queued\n"); } else { doutc(cl, "failed to queue caps release work\n"); } } void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) { int val; if (!nr) return; val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); if ((val % CEPH_CAPS_PER_RELEASE) < nr) { atomic_set(&mdsc->cap_reclaim_pending, 0); ceph_queue_cap_reclaim_work(mdsc); } } void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; if (mdsc->stopping) return; if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) { doutc(cl, "caps unlink work queued\n"); } else { doutc(cl, "failed to queue caps unlink work\n"); } } static void ceph_cap_unlink_work(struct work_struct *work) { struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, cap_unlink_work); struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_unlink_delay_list)) { struct ceph_inode_info *ci; struct inode *inode; ci = list_first_entry(&mdsc->cap_unlink_delay_list, struct ceph_inode_info, i_cap_delay_list); list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_delay_lock); } } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); } /* * requests */ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); unsigned int num_entries; int order; spin_lock(&ci->i_ceph_lock); num_entries = ci->i_files + ci->i_subdirs; spin_unlock(&ci->i_ceph_lock); num_entries = max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); order = get_order(size * num_entries); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, order); if (rinfo->dir_entries) break; order--; } if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; num_entries = min(num_entries, opt->max_readdir); rinfo->dir_buf_size = PAGE_SIZE << order; req->r_num_caps = num_entries + 1; req->r_args.readdir.max_entries = cpu_to_le32(num_entries); req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); return 0; } /* * Create an mds request. */ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req; req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); mutex_init(&req->r_fill_mutex); req->r_mdsc = mdsc; req->r_started = jiffies; req->r_start_latency = ktime_get(); req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; req->r_feature_needed = -1; kref_init(&req->r_kref); RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); ktime_get_coarse_real_ts64(&req->r_stamp); req->r_op = op; req->r_direct_mode = mode; return req; } /* * return oldest (lowest) request, tid in request tree, 0 if none. * * called under mdsc->mutex. */ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) { if (RB_EMPTY_ROOT(&mdsc->request_tree)) return NULL; return rb_entry(rb_first(&mdsc->request_tree), struct ceph_mds_request, r_node); } static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) { return mdsc->oldest_tid; } #if IS_ENABLED(CONFIG_FS_ENCRYPTION) static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { struct inode *dir = req->r_parent; struct dentry *dentry = req->r_dentry; const struct qstr *name = req->r_dname; u8 *cryptbuf = NULL; u32 len = 0; int ret = 0; /* only encode if we have parent and dentry */ if (!dir || !dentry) goto success; /* No-op unless this is encrypted */ if (!IS_ENCRYPTED(dir)) goto success; ret = ceph_fscrypt_prepare_readdir(dir); if (ret < 0) return ERR_PTR(ret); /* No key? Just ignore it. */ if (!fscrypt_has_encryption_key(dir)) goto success; if (!name) name = &dentry->d_name; if (!fscrypt_fname_encrypted_size(dir, name->len, NAME_MAX, &len)) { WARN_ON_ONCE(1); return ERR_PTR(-ENAMETOOLONG); } /* No need to append altname if name is short enough */ if (len <= CEPH_NOHASH_NAME_MAX) { len = 0; goto success; } cryptbuf = kmalloc(len, GFP_KERNEL); if (!cryptbuf) return ERR_PTR(-ENOMEM); ret = fscrypt_fname_encrypt(dir, name, cryptbuf, len); if (ret) { kfree(cryptbuf); return ERR_PTR(ret); } success: *plen = len; return cryptbuf; } #else static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) { *plen = 0; return NULL; } #endif /** * ceph_mdsc_build_path - build a path string to a given dentry * @mdsc: mds client * @dentry: dentry to which path should be built * @plen: returned length of string * @pbase: returned base inode number * @for_wire: is this path going to be sent to the MDS? * * Build a string that represents the path to the dentry. This is mostly called * for two different purposes: * * 1) we need to build a path string to send to the MDS (for_wire == true) * 2) we need a path string for local presentation (e.g. debugfs) * (for_wire == false) * * The path is built in reverse, starting with the dentry. Walk back up toward * the root, building the path until the first non-snapped inode is reached * (for_wire) or the root inode is reached (!for_wire). * * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, int *plen, u64 *pbase, int for_wire) { struct ceph_client *cl = mdsc->fsc->client; struct dentry *cur; struct inode *inode; char *path; int pos; unsigned seq; u64 base; if (!dentry) return ERR_PTR(-EINVAL); path = __getname(); if (!path) return ERR_PTR(-ENOMEM); retry: pos = PATH_MAX - 1; path[pos] = '\0'; seq = read_seqbegin(&rename_lock); cur = dget(dentry); for (;;) { struct dentry *parent; spin_lock(&cur->d_lock); inode = d_inode(cur); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { doutc(cl, "path+%d: %p SNAPDIR\n", pos, cur); spin_unlock(&cur->d_lock); parent = dget_parent(cur); } else if (for_wire && inode && dentry != cur && ceph_snap(inode) == CEPH_NOSNAP) { spin_unlock(&cur->d_lock); pos++; /* get rid of any prepended '/' */ break; } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) { pos -= cur->d_name.len; if (pos < 0) { spin_unlock(&cur->d_lock); break; } memcpy(path + pos, cur->d_name.name, cur->d_name.len); spin_unlock(&cur->d_lock); parent = dget_parent(cur); } else { int len, ret; char buf[NAME_MAX]; /* * Proactively copy name into buf, in case we need to * present it as-is. */ memcpy(buf, cur->d_name.name, cur->d_name.len); len = cur->d_name.len; spin_unlock(&cur->d_lock); parent = dget_parent(cur); ret = ceph_fscrypt_prepare_readdir(d_inode(parent)); if (ret < 0) { dput(parent); dput(cur); return ERR_PTR(ret); } if (fscrypt_has_encryption_key(d_inode(parent))) { len = ceph_encode_encrypted_fname(d_inode(parent), cur, buf); if (len < 0) { dput(parent); dput(cur); return ERR_PTR(len); } } pos -= len; if (pos < 0) { dput(parent); break; } memcpy(path + pos, buf, len); } dput(cur); cur = parent; /* Are we at the root? */ if (IS_ROOT(cur)) break; /* Are we out of buffer? */ if (--pos < 0) break; path[pos] = '/'; } inode = d_inode(cur); base = inode ? ceph_ino(inode) : 0; dput(cur); if (read_seqretry(&rename_lock, seq)) goto retry; if (pos < 0) { /* * The path is longer than PATH_MAX and this function * cannot ever succeed. Creating paths that long is * possible with Ceph, but Linux cannot use them. */ return ERR_PTR(-ENAMETOOLONG); } *pbase = base; *plen = PATH_MAX - 1 - pos; doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), base, *plen, path + pos); return path + pos; } static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, bool *pfreepath, bool parent_locked) { char *path; rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && !IS_ENCRYPTED(dir)) { *pino = ceph_ino(dir); rcu_read_unlock(); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; } rcu_read_unlock(); path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, bool *pfreepath) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct dentry *dentry; char *path; if (ceph_snap(inode) == CEPH_NOSNAP) { *pino = ceph_ino(inode); *ppathlen = 0; return 0; } dentry = d_find_alias(inode); path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1); dput(dentry); if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; *pfreepath = true; return 0; } /* * request arguments may be specified via an inode *, a dentry *, or * an explicit ino+path. */ static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, u64 *ino, bool *freepath, bool parent_locked) { struct ceph_client *cl = mdsc->fsc->client; int r = 0; if (rinode) { r = build_inode_path(rinode, ppath, pathlen, ino, freepath); doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode), ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino, freepath, parent_locked); doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { *ino = rino; *ppath = rpath; *pathlen = rpath ? strlen(rpath) : 0; doutc(cl, " path %.*s\n", *pathlen, rpath); } return r; } static void encode_mclientrequest_tail(void **p, const struct ceph_mds_request *req) { struct ceph_timespec ts; int i; ceph_encode_timespec64(&ts, &req->r_stamp); ceph_encode_copy(p, &ts, sizeof(ts)); /* v4: gid_list */ ceph_encode_32(p, req->r_cred->group_info->ngroups); for (i = 0; i < req->r_cred->group_info->ngroups; i++) ceph_encode_64(p, from_kgid(&init_user_ns, req->r_cred->group_info->gid[i])); /* v5: altname */ ceph_encode_32(p, req->r_altname_len); ceph_encode_copy(p, req->r_altname, req->r_altname_len); /* v6: fscrypt_auth and fscrypt_file */ if (req->r_fscrypt_auth) { u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth); ceph_encode_32(p, authlen); ceph_encode_copy(p, req->r_fscrypt_auth, authlen); } else { ceph_encode_32(p, 0); } if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) { ceph_encode_32(p, sizeof(__le64)); ceph_encode_64(p, req->r_fscrypt_file); } else { ceph_encode_32(p, 0); } } static inline u16 mds_supported_head_version(struct ceph_mds_session *session) { if (!test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features)) return 1; if (!test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) return 2; return CEPH_MDS_REQUEST_HEAD_VERSION; } static struct ceph_mds_request_head_legacy * find_legacy_request_head(void *p, u64 features) { bool legacy = !(features & CEPH_FEATURE_FS_BTIME); struct ceph_mds_request_head *head; if (legacy) return (struct ceph_mds_request_head_legacy *)p; head = (struct ceph_mds_request_head *)p; return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid; } /* * called under mdsc->mutex */ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_request_head_legacy *lhead; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; struct dentry *old_dentry = NULL; int len; u16 releases; void *p, *end; int ret; bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); u16 request_head_version = mds_supported_head_version(session); kuid_t caller_fsuid = req->r_cred->fsuid; kgid_t caller_fsgid = req->r_cred->fsgid; ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, &path1, &pathlen1, &ino1, &freepath1, test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } /* If r_old_dentry is set, then assume that its parent is locked */ if (req->r_old_dentry && !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED)) old_dentry = req->r_old_dentry; ret = set_request_path_attr(mdsc, NULL, old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; } req->r_altname = get_fscrypt_altname(req, &req->r_altname_len); if (IS_ERR(req->r_altname)) { msg = ERR_CAST(req->r_altname); req->r_altname = NULL; goto out_free2; } /* * For old cephs without supporting the 32bit retry/fwd feature * it will copy the raw memories directly when decoding the * requests. While new cephs will decode the head depending the * version member, so we need to make sure it will be compatible * with them both. */ if (legacy) len = sizeof(struct ceph_mds_request_head_legacy); else if (request_head_version == 1) len = offsetofend(struct ceph_mds_request_head, args); else if (request_head_version == 2) len = offsetofend(struct ceph_mds_request_head, ext_num_fwd); else len = sizeof(struct ceph_mds_request_head); /* filepaths */ len += 2 * (1 + sizeof(u32) + sizeof(u64)); len += pathlen1 + pathlen2; /* cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); if (req->r_dentry_drop) len += pathlen1; if (req->r_old_dentry_drop) len += pathlen2; /* MClientRequest tail */ /* req->r_stamp */ len += sizeof(struct ceph_timespec); /* gid list */ len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); /* alternate name */ len += sizeof(u32) + req->r_altname_len; /* fscrypt_auth */ len += sizeof(u32); // fscrypt_auth if (req->r_fscrypt_auth) len += ceph_fscrypt_auth_len(req->r_fscrypt_auth); /* fscrypt_file */ len += sizeof(u32); if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) len += sizeof(__le64); msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; } msg->hdr.tid = cpu_to_le64(req->r_tid); lhead = find_legacy_request_head(msg->front.iov_base, session->s_con.peer_features); if ((req->r_mnt_idmap != &nop_mnt_idmap) && !test_bit(CEPHFS_FEATURE_HAS_OWNER_UIDGID, &session->s_features)) { WARN_ON_ONCE(!IS_CEPH_MDS_OP_NEWINODE(req->r_op)); if (enable_unsafe_idmap) { pr_warn_once_client(cl, "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" " is not supported by MDS. UID/GID-based restrictions may" " not work properly.\n"); caller_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, VFSUIDT_INIT(req->r_cred->fsuid)); caller_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, VFSGIDT_INIT(req->r_cred->fsgid)); } else { pr_err_ratelimited_client(cl, "idmapped mount is used and CEPHFS_FEATURE_HAS_OWNER_UIDGID" " is not supported by MDS. Fail request with -EIO.\n"); ret = -EIO; goto out_err; } } /* * The ceph_mds_request_head_legacy didn't contain a version field, and * one was added when we moved the message version from 3->4. */ if (legacy) { msg->hdr.version = cpu_to_le16(3); p = msg->front.iov_base + sizeof(*lhead); } else if (request_head_version == 1) { struct ceph_mds_request_head *nhead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); nhead->version = cpu_to_le16(1); p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args); } else if (request_head_version == 2) { struct ceph_mds_request_head *nhead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(6); nhead->version = cpu_to_le16(2); p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, ext_num_fwd); } else { struct ceph_mds_request_head *nhead = msg->front.iov_base; kuid_t owner_fsuid; kgid_t owner_fsgid; msg->hdr.version = cpu_to_le16(6); nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); nhead->struct_len = cpu_to_le32(sizeof(struct ceph_mds_request_head)); if (IS_CEPH_MDS_OP_NEWINODE(req->r_op)) { owner_fsuid = from_vfsuid(req->r_mnt_idmap, &init_user_ns, VFSUIDT_INIT(req->r_cred->fsuid)); owner_fsgid = from_vfsgid(req->r_mnt_idmap, &init_user_ns, VFSGIDT_INIT(req->r_cred->fsgid)); nhead->owner_uid = cpu_to_le32(from_kuid(&init_user_ns, owner_fsuid)); nhead->owner_gid = cpu_to_le32(from_kgid(&init_user_ns, owner_fsgid)); } else { nhead->owner_uid = cpu_to_le32(-1); nhead->owner_gid = cpu_to_le32(-1); } p = msg->front.iov_base + sizeof(*nhead); } end = msg->front.iov_base + msg->front.iov_len; lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); lhead->op = cpu_to_le32(req->r_op); lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, caller_fsuid)); lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, caller_fsgid)); lhead->ino = cpu_to_le64(req->r_deleg_ino); lhead->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); /* make note of release offset, in case we need to replay */ req->r_request_release_offset = p - msg->front.iov_base; /* cap releases */ releases = 0; if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, req->r_inode ? req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, req->r_op == CEPH_MDS_OP_READDIR); if (req->r_dentry_drop) { ret = ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, req->r_dentry_unless); if (ret < 0) goto out_err; releases += ret; } if (req->r_old_dentry_drop) { ret = ceph_encode_dentry_release(&p, req->r_old_dentry, req->r_old_dentry_dir, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); if (ret < 0) goto out_err; releases += ret; } if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); if (drop_cap_releases) { releases = 0; p = msg->front.iov_base + req->r_request_release_offset; } lhead->num_releases = cpu_to_le16(releases); encode_mclientrequest_tail(&p, req); if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); msg = ERR_PTR(-ERANGE); goto out_free2; } msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { msg->hdr.data_len = 0; } msg->hdr.data_off = cpu_to_le16(0); out_free2: if (freepath2) ceph_mdsc_free_path((char *)path2, pathlen2); out_free1: if (freepath1) ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; out_err: ceph_msg_put(msg); msg = ERR_PTR(ret); goto out_free2; } /* * called under mdsc->mutex if error, under no mutex if * success. */ static void complete_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { req->r_end_latency = ktime_get(); if (req->r_callback) req->r_callback(mdsc, req); complete_all(&req->r_completion); } /* * called under mdsc->mutex */ static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request_head_legacy *lhead; struct ceph_mds_request_head *nhead; struct ceph_msg *msg; int flags = 0, old_max_retry; bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, &session->s_features); /* * Avoid infinite retrying after overflow. The client will * increase the retry count and if the MDS is old version, * so we limit to retry at most 256 times. */ if (req->r_attempts) { old_max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); if ((old_version && req->r_attempts >= old_max_retry) || ((uint32_t)req->r_attempts >= U32_MAX)) { pr_warn_ratelimited_client(cl, "request tid %llu seq overflow\n", req->r_tid); return -EMULTIHOP; } } req->r_attempts++; if (req->r_inode) { struct ceph_cap *cap = ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); if (cap) req->r_sent_on_mseq = cap->mseq; else req->r_sent_on_mseq = -1; } doutc(cl, "%p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. * Rebuilding paths will break for renames because * d_move mangles the src name. */ msg = req->r_request; lhead = find_legacy_request_head(msg->front.iov_base, session->s_con.peer_features); flags = le32_to_cpu(lhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; lhead->flags = cpu_to_le32(flags); if (req->r_target_inode) lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); lhead->num_retry = req->r_attempts - 1; if (!old_version) { nhead = (struct ceph_mds_request_head*)msg->front.iov_base; nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); } /* remove cap/dentry releases from message */ lhead->num_releases = 0; p = msg->front.iov_base + req->r_request_release_offset; encode_mclientrequest_tail(&p, req); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); return 0; } if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; } msg = create_request_message(session, req, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); return PTR_ERR(msg); } req->r_request = msg; lhead = find_legacy_request_head(msg->front.iov_base, session->s_con.peer_features); lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; lhead->flags = cpu_to_le32(flags); lhead->num_fwd = req->r_num_fwd; lhead->num_retry = req->r_attempts - 1; if (!old_version) { nhead = (struct ceph_mds_request_head*)msg->front.iov_base; nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); } doutc(cl, " r_parent = %p\n", req->r_parent); return 0; } /* * called under mdsc->mutex */ static int __send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int err; err = __prepare_send_request(session, req, drop_cap_releases); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); } return err; } /* * send request, or put it on the appropriate wait list. */ static void __do_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *session = NULL; int mds = -1; int err = 0; bool random; if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) __unregister_request(mdsc, req); return; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) { doutc(cl, "metadata corrupted\n"); err = -EIO; goto finish; } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { doutc(cl, "timed out\n"); err = -ETIMEDOUT; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { doutc(cl, "forced umount\n"); err = -EIO; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { if (mdsc->mdsmap_err) { err = mdsc->mdsmap_err; doutc(cl, "mdsmap err %d\n", err); goto finish; } if (mdsc->mdsmap->m_epoch == 0) { doutc(cl, "no mdsmap, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } if (!(mdsc->fsc->mount_options->flags & CEPH_MOUNT_OPT_MOUNTWAIT) && !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { err = -EHOSTUNREACH; goto finish; } } put_request_session(req); mds = __choose_mds(mdsc, req, &random); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { err = -EJUKEBOX; goto finish; } doutc(cl, "no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; } /* get, open session */ session = __ceph_lookup_mds_session(mdsc, mds); if (!session) { session = register_session(mdsc, mds); if (IS_ERR(session)) { err = PTR_ERR(session); goto finish; } } req->r_session = ceph_get_mds_session(session); doutc(cl, "mds%d session %p state %s\n", mds, session, ceph_session_state_name(session->s_state)); /* * The old ceph will crash the MDSs when see unknown OPs */ if (req->r_feature_needed > 0 && !test_bit(req->r_feature_needed, &session->s_features)) { err = -EOPNOTSUPP; goto out_session; } if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { /* * We cannot queue async requests since the caps and delegated * inodes are bound to the session. Just return -EJUKEBOX and * let the caller retry a sync request in that case. */ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { err = -EJUKEBOX; goto out_session; } /* * If the session has been REJECTED, then return a hard error, * unless it's a CLEANRECOVER mount, in which case we'll queue * it to the mdsc queue. */ if (session->s_state == CEPH_MDS_SESSION_REJECTED) { if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) list_add(&req->r_wait, &mdsc->waiting_for_map); else err = -EACCES; goto out_session; } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { err = __open_session(mdsc, session); if (err) goto out_session; /* retry the same mds later */ if (random) req->r_resend_mds = mds; } list_add(&req->r_wait, &session->s_waiting); goto out_session; } /* send request */ req->r_resend_mds = -1; /* forget any previous mds hint */ if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; /* * For async create we will choose the auth MDS of frag in parent * directory to send the request and usually this works fine, but * if the migrated the dirtory to another MDS before it could handle * it the request will be forwarded. * * And then the auth cap will be changed. */ if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); struct ceph_inode_info *ci; struct ceph_cap *cap; /* * The request maybe handled very fast and the new inode * hasn't been linked to the dentry yet. We need to wait * for the ceph_finish_async_create(), which shouldn't be * stuck too long or fail in thoery, to finish when forwarding * the request. */ if (!d_inode(req->r_dentry)) { err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, TASK_KILLABLE); if (err) { mutex_lock(&req->r_fill_mutex); set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); goto out_session; } } ci = ceph_inode(d_inode(req->r_dentry)); spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { doutc(cl, "session changed for auth cap %d -> %d\n", cap->session->s_mds, session->s_mds); /* Remove the auth cap from old session */ spin_lock(&cap->session->s_cap_lock); cap->session->s_nr_caps--; list_del_init(&cap->session_caps); spin_unlock(&cap->session->s_cap_lock); /* Add the auth cap to the new session */ cap->mds = mds; cap->session = session; spin_lock(&session->s_cap_lock); session->s_nr_caps++; list_add_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); change_auth_cap_ses(ci, session); } spin_unlock(&ci->i_ceph_lock); } err = __send_request(session, req, false); out_session: ceph_put_mds_session(session); finish: if (err) { doutc(cl, "early error %d\n", err); req->r_err = err; complete_request(mdsc, req); __unregister_request(mdsc, req); } return; } /* * called under mdsc->mutex */ static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; LIST_HEAD(tmp_list); list_splice_init(head, &tmp_list); while (!list_empty(&tmp_list)) { req = list_entry(tmp_list.next, struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); doutc(cl, " wake request %p tid %llu\n", req, req->r_tid); __do_request(mdsc, req); } } /* * Wake up threads with requests pending for @mds, so that they can * resubmit their requests to a possibly different mds. */ static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct rb_node *p = rb_first(&mdsc->request_tree); doutc(cl, "kick_requests mds%d\n", mds); while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts > 0) continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { doutc(cl, " kicking tid %llu\n", req->r_tid); list_del_init(&req->r_wait); __do_request(mdsc, req); } } } int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { struct ceph_client *cl = mdsc->fsc->client; int err = 0; /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_parent) { struct ceph_inode_info *ci = ceph_inode(req->r_parent); int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ? CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD; spin_lock(&ci->i_ceph_lock); ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); if (req->r_inode) { err = ceph_wait_on_async_create(req->r_inode); if (err) { doutc(cl, "wait for async create returned: %d\n", err); return err; } } if (!err && req->r_old_inode) { err = ceph_wait_on_async_create(req->r_old_inode); if (err) { doutc(cl, "wait for async create returned: %d\n", err); return err; } } doutc(cl, "submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); __do_request(mdsc, req); err = req->r_err; mutex_unlock(&mdsc->mutex); return err; } int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, ceph_mds_request_wait_callback_t wait_func) { struct ceph_client *cl = mdsc->fsc->client; int err; /* wait */ doutc(cl, "do_request waiting\n"); if (wait_func) { err = wait_func(mdsc, req); } else { long timeleft = wait_for_completion_killable_timeout( &req->r_completion, ceph_timeout_jiffies(req->r_timeout)); if (timeleft > 0) err = 0; else if (!timeleft) err = -ETIMEDOUT; /* timed out */ else err = timeleft; /* killed */ } doutc(cl, "do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); /* only abort if we didn't race with a real reply */ if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { err = le32_to_cpu(req->r_reply_info.head->result); } else if (err < 0) { doutc(cl, "aborted request %lld with %d\n", req->r_tid, err); /* * ensure we aren't running concurrently with * ceph_fill_trace or ceph_readdir_prepopulate, which * rely on locks (dir mutex) held by our caller. */ mutex_lock(&req->r_fill_mutex); req->r_err = err; set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); if (req->r_parent && (req->r_op & CEPH_MDS_OP_WRITE)) ceph_invalidate_dir_request(req); } else { err = req->r_err; } mutex_unlock(&mdsc->mutex); return err; } /* * Synchrously perform an mds request. Take care of all of the * session setup, forwarding, retry details. */ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { struct ceph_client *cl = mdsc->fsc->client; int err; doutc(cl, "do_request on %p\n", req); /* issue */ err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) err = ceph_mdsc_wait_request(mdsc, req, NULL); doutc(cl, "do_request %p done, result %d\n", req, err); return err; } /* * Invalidate dir's completeness, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *dir = req->r_parent; struct inode *old_dir = req->r_old_dentry_dir; struct ceph_client *cl = req->r_mdsc->fsc->client; doutc(cl, "invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir); ceph_dir_clear_complete(dir); if (old_dir) ceph_dir_clear_complete(old_dir); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) ceph_invalidate_dentry_lease(req->r_old_dentry); } /* * Handle mds reply. * * We take the session mutex and parse and process the reply immediately. * This preserves the logical ordering of replies, capabilities, etc., sent * by the MDS as they are applied to our local cache. */ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ struct ceph_snap_realm *realm; u64 tid; int err, result; int mds = session->s_mds; bool close_sessions = false; if (msg->front.iov_len < sizeof(*head)) { pr_err_client(cl, "got corrupt (short) reply\n"); ceph_msg_dump(msg); return; } /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { doutc(cl, "on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); return; } doutc(cl, "handle_reply %p\n", req); /* correct session? */ if (req->r_session != session) { pr_err_client(cl, "got %llu on session mds%d not mds%d\n", tid, session->s_mds, req->r_session ? req->r_session->s_mds : -1); mutex_unlock(&mdsc->mutex); goto out; } /* dup? */ if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { pr_warn_client(cl, "got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { pr_warn_client(cl, "got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } result = le32_to_cpu(head->result); if (head->safe) { set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); /* last request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) complete_all(&mdsc->safe_umount_waiters); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { /* * We already handled the unsafe response, now do the * cleanup. No need to examine the response; the MDS * doesn't include any result info in the safe * response. And even if it did, there is nothing * useful we could do with a revised return value. */ doutc(cl, "got safe reply %llu, mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } } else { set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } doutc(cl, "tid %lld result %d\n", tid, result); if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) err = parse_reply_info(session, msg, req, (u64)-1); else err = parse_reply_info(session, msg, req, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); /* Must find target inode outside of mutexes to avoid deadlocks */ rinfo = &req->r_reply_info; if ((err >= 0) && rinfo->head->is_target) { struct inode *in = xchg(&req->r_new_inode, NULL); struct ceph_vino tvino = { .ino = le64_to_cpu(rinfo->targeti.in->ino), .snap = le64_to_cpu(rinfo->targeti.in->snapid) }; /* * If we ended up opening an existing inode, discard * r_new_inode */ if (req->r_op == CEPH_MDS_OP_CREATE && !req->r_reply_info.has_create_ino) { /* This should never happen on an async create */ WARN_ON_ONCE(req->r_deleg_ino); iput(in); in = NULL; } in = ceph_get_inode(mdsc->fsc->sb, tvino, in); if (IS_ERR(in)) { err = PTR_ERR(in); mutex_lock(&session->s_mutex); goto out_err; } req->r_target_inode = in; } mutex_lock(&session->s_mutex); if (err < 0) { pr_err_client(cl, "got corrupt reply mds%d(tid:%lld)\n", mds, tid); ceph_msg_dump(msg); goto out_err; } /* snap trace */ realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); err = ceph_update_snap_trace(mdsc, rinfo->snapblob, rinfo->snapblob + rinfo->snapblob_len, le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, &realm); if (err) { up_write(&mdsc->snap_rwsem); close_sessions = true; if (err == -EIO) ceph_msg_dump(msg); goto out_err; } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); } /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); current->journal_info = req; err = ceph_fill_trace(mdsc->fsc->sb, req); if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) err = ceph_readdir_prepopulate(req, req->r_session); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); if (realm) ceph_put_snap_realm(mdsc, realm); if (err == 0) { if (req->r_target_inode && test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); spin_lock(&ci->i_unsafe_lock); list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); spin_unlock(&ci->i_unsafe_lock); } ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } out_err: mutex_lock(&mdsc->mutex); if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { if (err) { req->r_err = err; } else { req->r_reply = ceph_msg_get(msg); set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); } } else { doutc(cl, "reply arrived after request %lld was aborted\n", tid); } mutex_unlock(&mdsc->mutex); mutex_unlock(&session->s_mutex); /* kick calling process */ complete_request(mdsc, req); ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency, req->r_end_latency, err); out: ceph_mdsc_put_request(req); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; } /* * handle mds notification that our request has been forwarded. */ static void handle_forward(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; u64 tid = le64_to_cpu(msg->hdr.tid); u32 next_mds; u32 fwd_seq; int err = -EINVAL; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; bool aborted = false; ceph_decode_need(&p, end, 2*sizeof(u32), bad); next_mds = ceph_decode_32(&p); fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); req = lookup_get_request(mdsc, tid); if (!req) { mutex_unlock(&mdsc->mutex); doutc(cl, "forward tid %llu to mds%d - req dne\n", tid, next_mds); return; /* dup reply? */ } if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { doutc(cl, "forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { /* * Avoid infinite retrying after overflow. * * The MDS will increase the fwd count and in client side * if the num_fwd is less than the one saved in request * that means the MDS is an old version and overflowed of * 8 bits. */ mutex_lock(&req->r_fill_mutex); req->r_err = -EMULTIHOP; set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); mutex_unlock(&req->r_fill_mutex); aborted = true; pr_warn_ratelimited_client(cl, "forward tid %llu seq overflow\n", tid); } else { /* resend. forward race not possible; mds would drop */ doutc(cl, "forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); __do_request(mdsc, req); } mutex_unlock(&mdsc->mutex); /* kick calling process */ if (aborted) complete_request(mdsc, req); ceph_mdsc_put_request(req); return; bad: pr_err_client(cl, "decode error err=%d\n", err); ceph_msg_dump(msg); } static int __decode_session_metadata(void **p, void *end, bool *blocklisted) { /* map<string,string> */ u32 n; bool err_str; ceph_decode_32_safe(p, end, n, bad); while (n-- > 0) { u32 len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); err_str = !strncmp(*p, "error_string", len); *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); /* * Match "blocklisted (blacklisted)" from newer MDSes, * or "blacklisted" from older MDSes. */ if (err_str && strnstr(*p, "blacklisted", len)) *blocklisted = true; *p += len; } return 0; bad: return -1; } /* * handle a mds session control message */ static void handle_session(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; int mds = session->s_mds; int msg_version = le16_to_cpu(msg->hdr.version); void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; struct ceph_mds_session_head *h; struct ceph_mds_cap_auth *cap_auths = NULL; u32 op, cap_auths_num = 0; u64 seq, features = 0; int wake = 0; bool blocklisted = false; u32 i; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); h = p; p += sizeof(*h); op = le32_to_cpu(h->op); seq = le64_to_cpu(h->seq); if (msg_version >= 3) { u32 len; /* version >= 2 and < 5, decode metadata, skip otherwise * as it's handled via flags. */ if (msg_version >= 5) ceph_decode_skip_map(&p, end, string, string, bad); else if (__decode_session_metadata(&p, end, &blocklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); if (len) { ceph_decode_64_safe(&p, end, features, bad); p += len - sizeof(features); } } if (msg_version >= 5) { u32 flags, len; /* version >= 4 */ ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */ ceph_decode_32_safe(&p, end, len, bad); /* len */ ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */ /* version >= 5, flags */ ceph_decode_32_safe(&p, end, flags, bad); if (flags & CEPH_SESSION_BLOCKLISTED) { pr_warn_client(cl, "mds%d session blocklisted\n", session->s_mds); blocklisted = true; } } if (msg_version >= 6) { ceph_decode_32_safe(&p, end, cap_auths_num, bad); doutc(cl, "cap_auths_num %d\n", cap_auths_num); if (cap_auths_num && op != CEPH_SESSION_OPEN) { WARN_ON_ONCE(op != CEPH_SESSION_OPEN); goto skip_cap_auths; } cap_auths = kcalloc(cap_auths_num, sizeof(struct ceph_mds_cap_auth), GFP_KERNEL); if (!cap_auths) { pr_err_client(cl, "No memory for cap_auths\n"); return; } for (i = 0; i < cap_auths_num; i++) { u32 _len, j; /* struct_v, struct_compat, and struct_len in MDSCapAuth */ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); /* struct_v, struct_compat, and struct_len in MDSCapMatch */ ceph_decode_skip_n(&p, end, 2 + sizeof(u32), bad); ceph_decode_64_safe(&p, end, cap_auths[i].match.uid, bad); ceph_decode_32_safe(&p, end, _len, bad); if (_len) { cap_auths[i].match.gids = kcalloc(_len, sizeof(u32), GFP_KERNEL); if (!cap_auths[i].match.gids) { pr_err_client(cl, "No memory for gids\n"); goto fail; } cap_auths[i].match.num_gids = _len; for (j = 0; j < _len; j++) ceph_decode_32_safe(&p, end, cap_auths[i].match.gids[j], bad); } ceph_decode_32_safe(&p, end, _len, bad); if (_len) { cap_auths[i].match.path = kcalloc(_len + 1, sizeof(char), GFP_KERNEL); if (!cap_auths[i].match.path) { pr_err_client(cl, "No memory for path\n"); goto fail; } ceph_decode_copy(&p, cap_auths[i].match.path, _len); /* Remove the tailing '/' */ while (_len && cap_auths[i].match.path[_len - 1] == '/') { cap_auths[i].match.path[_len - 1] = '\0'; _len -= 1; } } ceph_decode_32_safe(&p, end, _len, bad); if (_len) { cap_auths[i].match.fs_name = kcalloc(_len + 1, sizeof(char), GFP_KERNEL); if (!cap_auths[i].match.fs_name) { pr_err_client(cl, "No memory for fs_name\n"); goto fail; } ceph_decode_copy(&p, cap_auths[i].match.fs_name, _len); } ceph_decode_8_safe(&p, end, cap_auths[i].match.root_squash, bad); ceph_decode_8_safe(&p, end, cap_auths[i].readable, bad); ceph_decode_8_safe(&p, end, cap_auths[i].writeable, bad); doutc(cl, "uid %lld, num_gids %u, path %s, fs_name %s, root_squash %d, readable %d, writeable %d\n", cap_auths[i].match.uid, cap_auths[i].match.num_gids, cap_auths[i].match.path, cap_auths[i].match.fs_name, cap_auths[i].match.root_squash, cap_auths[i].readable, cap_auths[i].writeable); } } skip_cap_auths: mutex_lock(&mdsc->mutex); if (op == CEPH_SESSION_OPEN) { if (mdsc->s_cap_auths) { for (i = 0; i < mdsc->s_cap_auths_num; i++) { kfree(mdsc->s_cap_auths[i].match.gids); kfree(mdsc->s_cap_auths[i].match.path); kfree(mdsc->s_cap_auths[i].match.fs_name); } kfree(mdsc->s_cap_auths); } mdsc->s_cap_auths_num = cap_auths_num; mdsc->s_cap_auths = cap_auths; } if (op == CEPH_SESSION_CLOSE) { ceph_get_mds_session(session); __unregister_session(mdsc, session); } /* FIXME: this ttl calculation is generous */ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); doutc(cl, "mds%d %s %p state %s seq %llu\n", mds, ceph_session_op_name(op), session, ceph_session_state_name(session->s_state), seq); if (session->s_state == CEPH_MDS_SESSION_HUNG) { session->s_state = CEPH_MDS_SESSION_OPEN; pr_info_client(cl, "mds%d came back\n", session->s_mds); } switch (op) { case CEPH_SESSION_OPEN: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info_client(cl, "mds%d reconnect success\n", session->s_mds); session->s_features = features; if (session->s_state == CEPH_MDS_SESSION_OPEN) { pr_notice_client(cl, "mds%d is already opened\n", session->s_mds); } else { session->s_state = CEPH_MDS_SESSION_OPEN; renewed_caps(mdsc, session, 0); if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) metric_schedule_delayed(&mdsc->metric); } /* * The connection maybe broken and the session in client * side has been reinitialized, need to update the seq * anyway. */ if (!session->s_seq && seq) session->s_seq = seq; wake = 1; if (mdsc->stopping) __close_session(mdsc, session); break; case CEPH_SESSION_RENEWCAPS: if (session->s_renew_seq == seq) renewed_caps(mdsc, session, 1); break; case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info_client(cl, "mds%d reconnect denied\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_CLOSED; cleanup_session_requests(mdsc, session); remove_session_caps(session); wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); break; case CEPH_SESSION_STALE: pr_info_client(cl, "mds%d caps went stale, renewing\n", session->s_mds); atomic_inc(&session->s_cap_gen); session->s_cap_ttl = jiffies - 1; send_renew_caps(mdsc, session); break; case CEPH_SESSION_RECALL_STATE: ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; case CEPH_SESSION_FLUSHMSG: /* flush cap releases */ spin_lock(&session->s_cap_lock); if (session->s_num_cap_releases) ceph_flush_session_cap_releases(mdsc, session); spin_unlock(&session->s_cap_lock); send_flushmsg_ack(mdsc, session, seq); break; case CEPH_SESSION_FORCE_RO: doutc(cl, "force_session_readonly %p\n", session); spin_lock(&session->s_cap_lock); session->s_readonly = true; spin_unlock(&session->s_cap_lock); wake_up_session_caps(session, FORCE_RO); break; case CEPH_SESSION_REJECT: WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING); pr_info_client(cl, "mds%d rejected session\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); if (blocklisted) mdsc->fsc->blocklisted = true; wake = 2; /* for good measure */ break; default: pr_err_client(cl, "bad op %d mds%d\n", op, mds); WARN_ON(1); } mutex_unlock(&session->s_mutex); if (wake) { mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); if (wake == 2) kick_requests(mdsc, mds); mutex_unlock(&mdsc->mutex); } if (op == CEPH_SESSION_CLOSE) ceph_put_mds_session(session); return; bad: pr_err_client(cl, "corrupt message mds%d len %d\n", mds, (int)msg->front.iov_len); ceph_msg_dump(msg); fail: for (i = 0; i < cap_auths_num; i++) { kfree(cap_auths[i].match.gids); kfree(cap_auths[i].match.path); kfree(cap_auths[i].match.fs_name); } kfree(cap_auths); return; } void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps); } } void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req) { struct ceph_client *cl = req->r_mdsc->fsc->client; int dcaps; dcaps = xchg(&req->r_dir_caps, 0); if (dcaps) { doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps); } } /* * called under session->mutex. */ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_request *req, *nreq; struct rb_node *p; doutc(mdsc->fsc->client, "mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) __send_request(session, req, true); /* * also re-send old requests when MDS enters reconnect stage. So that MDS * can process completed request in clientreplay stage. */ p = rb_first(&mdsc->request_tree); while (p) { req = rb_entry(p, struct ceph_mds_request, r_node); p = rb_next(p); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) continue; if (req->r_attempts == 0) continue; /* only old requests */ if (!req->r_session) continue; if (req->r_session->s_mds != session->s_mds) continue; ceph_mdsc_release_dir_caps_async(req); __send_request(session, req, true); } mutex_unlock(&mdsc->mutex); } static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) { struct ceph_msg *reply; struct ceph_pagelist *_pagelist; struct page *page; __le32 *addr; int err = -ENOMEM; if (!recon_state->allow_multi) return -ENOSPC; /* can't handle message that contains both caps and realm */ BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms); /* pre-allocate new pagelist */ _pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!_pagelist) return -ENOMEM; reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); if (!reply) goto fail_msg; /* placeholder for nr_caps */ err = ceph_pagelist_encode_32(_pagelist, 0); if (err < 0) goto fail; if (recon_state->nr_caps) { /* currently encoding caps */ err = ceph_pagelist_encode_32(recon_state->pagelist, 0); if (err) goto fail; } else { /* placeholder for nr_realms (currently encoding relams) */ err = ceph_pagelist_encode_32(_pagelist, 0); if (err < 0) goto fail; } err = ceph_pagelist_encode_8(recon_state->pagelist, 1); if (err) goto fail; page = list_first_entry(&recon_state->pagelist->head, struct page, lru); addr = kmap_atomic(page); if (recon_state->nr_caps) { /* currently encoding caps */ *addr = cpu_to_le32(recon_state->nr_caps); } else { /* currently encoding relams */ *(addr + 1) = cpu_to_le32(recon_state->nr_realms); } kunmap_atomic(addr); reply->hdr.version = cpu_to_le16(5); reply->hdr.compat_version = cpu_to_le16(4); reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length); ceph_msg_data_add_pagelist(reply, recon_state->pagelist); ceph_con_send(&recon_state->session->s_con, reply); ceph_pagelist_release(recon_state->pagelist); recon_state->pagelist = _pagelist; recon_state->nr_caps = 0; recon_state->nr_realms = 0; recon_state->msg_version = 5; return 0; fail: ceph_msg_put(reply); fail_msg: ceph_pagelist_release(_pagelist); return err; } static struct dentry* d_find_primary(struct inode *inode) { struct dentry *alias, *dn = NULL; if (hlist_empty(&inode->i_dentry)) return NULL; spin_lock(&inode->i_lock); if (hlist_empty(&inode->i_dentry)) goto out_unlock; if (S_ISDIR(inode->i_mode)) { alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); if (!IS_ROOT(alias)) dn = dget(alias); goto out_unlock; } hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias) && (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { dn = dget_dlock(alias); } spin_unlock(&alias->d_lock); if (dn) break; } out_unlock: spin_unlock(&inode->i_lock); return dn; } /* * Encode information about a cap for a reconnect with the MDS. */ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = ceph_inode_to_client(inode); union { struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; struct dentry *dentry; struct ceph_cap *cap; char *path; int pathlen = 0, err; u64 pathbase; u64 snap_follows; dentry = d_find_primary(inode); if (dentry) { /* set pathbase to parent dir when msg_version >= 2 */ path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, recon_state->msg_version >= 2); dput(dentry); if (IS_ERR(path)) { err = PTR_ERR(path); goto out_err; } } else { path = NULL; pathbase = 0; } spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { spin_unlock(&ci->i_ceph_lock); err = 0; goto out_err; } doutc(cl, " adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ cap->mseq = 0; /* and migrate_seq */ cap->cap_gen = atomic_read(&cap->session->s_cap_gen); /* These are lost when the session goes away */ if (S_ISDIR(inode->i_mode)) { if (cap->issued & CEPH_CAP_DIR_CREATE) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); } cap->issued &= ~CEPH_CAP_ANY_DIR_OPS; } if (recon_state->msg_version >= 2) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { struct timespec64 ts; rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(i_size_read(inode)); ts = inode_get_mtime(inode); ceph_encode_timespec64(&rec.v1.mtime, &ts); ts = inode_get_atime(inode); ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } if (list_empty(&ci->i_cap_snaps)) { snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0; } else { struct ceph_cap_snap *capsnap = list_first_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); snap_follows = capsnap->follows; } spin_unlock(&ci->i_ceph_lock); if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; struct ceph_filelock *flocks = NULL; size_t struct_len, total_len = sizeof(u64); u8 struct_v = 0; encode_again: if (rec.v2.flock_len) { ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); } else { num_fcntl_locks = 0; num_flock_locks = 0; } if (num_fcntl_locks + num_flock_locks > 0) { flocks = kmalloc_array(num_fcntl_locks + num_flock_locks, sizeof(struct ceph_filelock), GFP_NOFS); if (!flocks) { err = -ENOMEM; goto out_err; } err = ceph_encode_locks_to_buffer(inode, flocks, num_fcntl_locks, num_flock_locks); if (err) { kfree(flocks); flocks = NULL; if (err == -ENOSPC) goto encode_again; goto out_err; } } else { kfree(flocks); flocks = NULL; } if (recon_state->msg_version >= 3) { /* version, compat_version and struct_len */ total_len += 2 * sizeof(u8) + sizeof(u32); struct_v = 2; } /* * number of encoded locks is stable, so copy to pagelist */ struct_len = 2 * sizeof(u32) + (num_fcntl_locks + num_flock_locks) * sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ total_len += struct_len; if (pagelist->length + total_len > RECONNECT_MAX_SIZE) { err = send_reconnect_partial(recon_state); if (err) goto out_freeflocks; pagelist = recon_state->pagelist; } err = ceph_pagelist_reserve(pagelist, total_len); if (err) goto out_freeflocks; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); if (recon_state->msg_version >= 3) { ceph_pagelist_encode_8(pagelist, struct_v); ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); if (struct_v >= 2) ceph_pagelist_encode_64(pagelist, snap_follows); out_freeflocks: kfree(flocks); } else { err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + pathlen + sizeof(rec.v1)); if (err) goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); } out_err: ceph_mdsc_free_path(path, pathlen); if (!err) recon_state->nr_caps++; return err; } static int encode_snap_realms(struct ceph_mds_client *mdsc, struct ceph_reconnect_state *recon_state) { struct rb_node *p; struct ceph_pagelist *pagelist = recon_state->pagelist; struct ceph_client *cl = mdsc->fsc->client; int err = 0; if (recon_state->msg_version >= 4) { err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms); if (err < 0) goto fail; } /* * snaprealms. we provide mds with the ino, seq (version), and * parent for all of our realms. If the mds has any newer info, * it will tell us. */ for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { struct ceph_snap_realm *realm = rb_entry(p, struct ceph_snap_realm, node); struct ceph_mds_snaprealm_reconnect sr_rec; if (recon_state->msg_version >= 4) { size_t need = sizeof(u8) * 2 + sizeof(u32) + sizeof(sr_rec); if (pagelist->length + need > RECONNECT_MAX_SIZE) { err = send_reconnect_partial(recon_state); if (err) goto fail; pagelist = recon_state->pagelist; } err = ceph_pagelist_reserve(pagelist, need); if (err) goto fail; ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); } doutc(cl, " adding snap realm %llx seq %lld parent %llx\n", realm->ino, realm->seq, realm->parent_ino); sr_rec.ino = cpu_to_le64(realm->ino); sr_rec.seq = cpu_to_le64(realm->seq); sr_rec.parent = cpu_to_le64(realm->parent_ino); err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); if (err) goto fail; recon_state->nr_realms++; } fail: return err; } /* * If an MDS fails and recovers, clients need to reconnect in order to * reestablish shared state. This includes all caps issued through * this session _and_ the snap_realm hierarchy. Because it's not * clear which snap realms the mds cares about, we send everything we * know about.. that ensures we'll then get any new info the * recovering MDS might have. * * This is a relatively heavyweight operation, but it's rare. */ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_msg *reply; int mds = session->s_mds; int err = -ENOMEM; struct ceph_reconnect_state recon_state = { .session = session, }; LIST_HEAD(dispose); pr_info_client(cl, "mds%d reconnect start\n", mds); recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); if (!recon_state.pagelist) goto fail_nopagelist; reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); if (!reply) goto fail_nomsg; xa_destroy(&session->s_delegated_inos); mutex_lock(&session->s_mutex); session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; doutc(cl, "session %p state %s\n", session, ceph_session_state_name(session->s_state)); atomic_inc(&session->s_cap_gen); spin_lock(&session->s_cap_lock); /* don't know if session is readonly */ session->s_readonly = 0; /* * notify __ceph_remove_cap() that we are composing cap reconnect. * If a cap get released before being added to the cap reconnect, * __ceph_remove_cap() should skip queuing cap release. */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ detach_cap_releases(session, &dispose); spin_unlock(&session->s_cap_lock); dispose_cap_releases(mdsc, &dispose); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) shrink_dcache_parent(mdsc->fsc->sb->s_root); ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); /* replay unsafe requests */ replay_unsafe_requests(mdsc, session); ceph_early_kick_flushing_caps(mdsc, session); down_read(&mdsc->snap_rwsem); /* placeholder for nr_caps */ err = ceph_pagelist_encode_32(recon_state.pagelist, 0); if (err) goto fail; if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { recon_state.msg_version = 3; recon_state.allow_multi = true; } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) { recon_state.msg_version = 3; } else { recon_state.msg_version = 2; } /* traverse this session's caps */ err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; spin_unlock(&session->s_cap_lock); if (err < 0) goto fail; /* check if all realms can be encoded into current message */ if (mdsc->num_snap_realms) { size_t total_len = recon_state.pagelist->length + mdsc->num_snap_realms * sizeof(struct ceph_mds_snaprealm_reconnect); if (recon_state.msg_version >= 4) { /* number of realms */ total_len += sizeof(u32); /* version, compat_version and struct_len */ total_len += mdsc->num_snap_realms * (2 * sizeof(u8) + sizeof(u32)); } if (total_len > RECONNECT_MAX_SIZE) { if (!recon_state.allow_multi) { err = -ENOSPC; goto fail; } if (recon_state.nr_caps) { err = send_reconnect_partial(&recon_state); if (err) goto fail; } recon_state.msg_version = 5; } } err = encode_snap_realms(mdsc, &recon_state); if (err < 0) goto fail; if (recon_state.msg_version >= 5) { err = ceph_pagelist_encode_8(recon_state.pagelist, 0); if (err < 0) goto fail; } if (recon_state.nr_caps || recon_state.nr_realms) { struct page *page = list_first_entry(&recon_state.pagelist->head, struct page, lru); __le32 *addr = kmap_atomic(page); if (recon_state.nr_caps) { WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms); *addr = cpu_to_le32(recon_state.nr_caps); } else if (recon_state.msg_version >= 4) { *(addr + 1) = cpu_to_le32(recon_state.nr_realms); } kunmap_atomic(addr); } reply->hdr.version = cpu_to_le16(recon_state.msg_version); if (recon_state.msg_version >= 4) reply->hdr.compat_version = cpu_to_le16(4); reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length); ceph_msg_data_add_pagelist(reply, recon_state.pagelist); ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); ceph_pagelist_release(recon_state.pagelist); return; fail: ceph_msg_put(reply); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); fail_nomsg: ceph_pagelist_release(recon_state.pagelist); fail_nopagelist: pr_err_client(cl, "error %d preparing reconnect for mds%d\n", err, mds); return; } /* * compare old and new mdsmaps, kicking requests * and closing out old connections as necessary * * called under mdsc->mutex. */ static void check_new_map(struct ceph_mds_client *mdsc, struct ceph_mdsmap *newmap, struct ceph_mdsmap *oldmap) { int i, j, err; int oldstate, newstate; struct ceph_mds_session *s; unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0}; struct ceph_client *cl = mdsc->fsc->client; doutc(cl, "new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); if (newmap->m_info) { for (i = 0; i < newmap->possible_max_rank; i++) { for (j = 0; j < newmap->m_info[i].num_export_targets; j++) set_bit(newmap->m_info[i].export_targets[j], targets); } } for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); doutc(cl, "mds%d state %s%s -> %s%s (session %s)\n", i, ceph_mds_state_name(oldstate), ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", ceph_mds_state_name(newstate), ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", ceph_session_state_name(s->s_state)); if (i >= newmap->possible_max_rank) { /* force close session for stopped mds */ ceph_get_mds_session(s); __unregister_session(mdsc, s); __wake_requests(mdsc, &s->s_waiting); mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); cleanup_session_requests(mdsc, s); remove_session_caps(s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); kick_requests(mdsc, i); continue; } if (memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { /* just close it */ mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); mutex_lock(&mdsc->mutex); ceph_con_close(&s->s_con); mutex_unlock(&s->s_mutex); s->s_state = CEPH_MDS_SESSION_RESTARTING; } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } /* * send reconnect? */ if (s->s_state == CEPH_MDS_SESSION_RESTARTING && newstate >= CEPH_MDS_STATE_RECONNECT) { mutex_unlock(&mdsc->mutex); clear_bit(i, targets); send_mds_reconnect(mdsc, s); mutex_lock(&mdsc->mutex); } /* * kick request on any mds that has gone active. */ if (oldstate < CEPH_MDS_STATE_ACTIVE && newstate >= CEPH_MDS_STATE_ACTIVE) { if (oldstate != CEPH_MDS_STATE_CREATING && oldstate != CEPH_MDS_STATE_STARTING) pr_info_client(cl, "mds%d recovery completed\n", s->s_mds); kick_requests(mdsc, i); mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); mutex_lock(&mdsc->mutex); ceph_kick_flushing_caps(mdsc, s); mutex_unlock(&s->s_mutex); wake_up_session_caps(s, RECONNECT); } } /* * Only open and reconnect sessions that don't exist yet. */ for (i = 0; i < newmap->possible_max_rank; i++) { /* * In case the import MDS is crashed just after * the EImportStart journal is flushed, so when * a standby MDS takes over it and is replaying * the EImportStart journal the new MDS daemon * will wait the client to reconnect it, but the * client may never register/open the session yet. * * Will try to reconnect that MDS daemon if the * rank number is in the export targets array and * is the up:reconnect state. */ newstate = ceph_mdsmap_get_state(newmap, i); if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT) continue; /* * The session maybe registered and opened by some * requests which were choosing random MDSes during * the mdsc->mutex's unlock/lock gap below in rare * case. But the related MDS daemon will just queue * that requests and be still waiting for the client's * reconnection request in up:reconnect state. */ s = __ceph_lookup_mds_session(mdsc, i); if (likely(!s)) { s = __open_export_target_session(mdsc, i); if (IS_ERR(s)) { err = PTR_ERR(s); pr_err_client(cl, "failed to open export target session, err %d\n", err); continue; } } doutc(cl, "send reconnect to export target mds.%d\n", i); mutex_unlock(&mdsc->mutex); send_mds_reconnect(mdsc, s); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; if (!ceph_mdsmap_is_laggy(newmap, i)) continue; if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG || s->s_state == CEPH_MDS_SESSION_CLOSING) { doutc(cl, " connecting to export targets of laggy mds%d\n", i); __open_export_target_sessions(mdsc, s); } } } /* * leases */ /* * caller must hold session s_mutex, dentry->d_lock */ void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); ceph_put_mds_session(di->lease_session); di->lease_session = NULL; } static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_client *cl = mdsc->fsc->client; struct super_block *sb = mdsc->fsc->sb; struct inode *inode; struct dentry *parent, *dentry; struct ceph_dentry_info *di; int mds = session->s_mds; struct ceph_mds_lease *h = msg->front.iov_base; u32 seq; struct ceph_vino vino; struct qstr dname; int release = 0; doutc(cl, "from mds%d\n", mds); if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) goto bad; vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); dname.len = get_unaligned_le32(h + 1); if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len) goto bad; dname.name = (void *)(h + 1) + sizeof(u32); /* lookup inode */ inode = ceph_find_inode(sb, vino); doutc(cl, "%s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); mutex_lock(&session->s_mutex); if (!inode) { doutc(cl, "no inode %llx\n", vino.ino); goto release; } /* dentry */ parent = d_find_alias(inode); if (!parent) { doutc(cl, "no parent dentry on inode %p\n", inode); WARN_ON(1); goto release; /* hrm... */ } dname.hash = full_name_hash(parent, dname.name, dname.len); dentry = d_lookup(parent, &dname); dput(parent); if (!dentry) goto release; spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); switch (h->action) { case CEPH_MDS_LEASE_REVOKE: if (di->lease_session == session) { if (ceph_seq_cmp(di->lease_seq, seq) > 0) h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); } release = 1; break; case CEPH_MDS_LEASE_RENEW: if (di->lease_session == session && di->lease_gen == atomic_read(&session->s_cap_gen) && di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; di->time = di->lease_renew_from + duration; di->lease_renew_after = di->lease_renew_from + (duration >> 1); di->lease_renew_from = 0; } break; } spin_unlock(&dentry->d_lock); dput(dentry); if (!release) goto out; release: /* let's just reuse the same message */ h->action = CEPH_MDS_LEASE_REVOKE_ACK; ceph_msg_get(msg); ceph_con_send(&session->s_con, msg); out: mutex_unlock(&session->s_mutex); iput(inode); ceph_dec_mds_stopping_blocker(mdsc); return; bad: ceph_dec_mds_stopping_blocker(mdsc); pr_err_client(cl, "corrupt lease message\n"); ceph_msg_dump(msg); } void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, struct dentry *dentry, char action, u32 seq) { struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_msg *msg; struct ceph_mds_lease *lease; struct inode *dir; int len = sizeof(*lease) + sizeof(u32) + NAME_MAX; doutc(cl, "identry %p %s to mds%d\n", dentry, ceph_lease_op_name(action), session->s_mds); msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; lease->action = action; lease->seq = cpu_to_le32(seq); spin_lock(&dentry->d_lock); dir = d_inode(dentry->d_parent); lease->ino = cpu_to_le64(ceph_ino(dir)); lease->first = lease->last = cpu_to_le64(ceph_snap(dir)); put_unaligned_le32(dentry->d_name.len, lease + 1); memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dentry->d_name.len); spin_unlock(&dentry->d_lock); ceph_con_send(&session->s_con, msg); } /* * lock unlock the session, to wait ongoing session activities */ static void lock_unlock_session(struct ceph_mds_session *s) { mutex_lock(&s->s_mutex); mutex_unlock(&s->s_mutex); } static void maybe_recover_session(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_fs_client *fsc = mdsc->fsc; if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) return; if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) return; if (!READ_ONCE(fsc->blocklisted)) return; pr_info_client(cl, "auto reconnect after blocklisted\n"); ceph_force_reconnect(fsc->sb); } bool check_session_state(struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { s->s_state = CEPH_MDS_SESSION_HUNG; pr_info_client(cl, "mds%d hung\n", s->s_mds); } break; case CEPH_MDS_SESSION_CLOSING: case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: case CEPH_MDS_SESSION_CLOSED: case CEPH_MDS_SESSION_REJECTED: return false; } return true; } /* * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, * then we need to retransmit that request. */ void inc_session_sequence(struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; lockdep_assert_held(&s->s_mutex); s->s_seq++; if (s->s_state == CEPH_MDS_SESSION_CLOSING) { int ret; doutc(cl, "resending session close request for mds%d\n", s->s_mds); ret = request_close_session(s); if (ret < 0) pr_err_client(cl, "unable to close session to mds%d: %d\n", s->s_mds, ret); } } /* * delayed work -- periodically trim expired leases, renew caps with mds. If * the @delay parameter is set to 0 or if it's more than 5 secs, the default * workqueue delay value of 5 secs will be used. */ static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay) { unsigned long max_delay = HZ * 5; /* 5 secs default delay */ if (!delay || (delay > max_delay)) delay = max_delay; schedule_delayed_work(&mdsc->delayed_work, round_jiffies_relative(delay)); } static void delayed_work(struct work_struct *work) { struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, delayed_work.work); unsigned long delay; int renew_interval; int renew_caps; int i; doutc(mdsc->fsc->client, "mdsc delayed_work\n"); if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED) return; mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; renew_caps = time_after_eq(jiffies, HZ*renew_interval + mdsc->last_renew_caps); if (renew_caps) mdsc->last_renew_caps = jiffies; for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); if (!s) continue; if (!check_session_state(s)) { ceph_put_mds_session(s); continue; } mutex_unlock(&mdsc->mutex); ceph_flush_session_cap_releases(mdsc, s); mutex_lock(&s->s_mutex); if (renew_caps) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); delay = ceph_check_delayed_caps(mdsc); ceph_queue_cap_reclaim_work(mdsc); ceph_trim_snapid_map(mdsc); maybe_recover_session(mdsc); schedule_delayed(mdsc, delay); } int ceph_mdsc_init(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc; int err; mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); if (!mdsc) return -ENOMEM; mdsc->fsc = fsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); if (!mdsc->mdsmap) { err = -ENOMEM; goto err_mdsc; } init_completion(&mdsc->safe_umount_waiters); spin_lock_init(&mdsc->stopping_lock); atomic_set(&mdsc->stopping_blockers, 0); init_completion(&mdsc->stopping_waiter); atomic64_set(&mdsc->dirty_folios, 0); init_waitqueue_head(&mdsc->flush_end_wq); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; mutex_init(&mdsc->quotarealms_inodes_mutex); init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); #ifdef CONFIG_DEBUG_FS INIT_LIST_HEAD(&mdsc->cap_wait_list); #endif spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); mdsc->last_cap_flush_tid = 1; INIT_LIST_HEAD(&mdsc->cap_flush_list); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work); err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; spin_lock_init(&mdsc->dentry_list_lock); INIT_LIST_HEAD(&mdsc->dentry_leases); INIT_LIST_HEAD(&mdsc->dentry_dir_leases); ceph_caps_init(mdsc); ceph_adjust_caps_max_min(mdsc, fsc->mount_options); spin_lock_init(&mdsc->snapid_map_lock); mdsc->snapid_map_tree = RB_ROOT; INIT_LIST_HEAD(&mdsc->snapid_map_lru); init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; strscpy(mdsc->nodename, utsname()->nodename, sizeof(mdsc->nodename)); fsc->mdsc = mdsc; return 0; err_mdsmap: kfree(mdsc->mdsmap); err_mdsc: kfree(mdsc); return err; } /* * Wait for safe replies on open mds requests. If we time out, drop * all requests from the tree to avoid dangling dentry refs. */ static void wait_requests(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { mutex_unlock(&mdsc->mutex); doutc(cl, "waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); while ((req = __get_oldest_req(mdsc))) { doutc(cl, "timed out on tid %llu\n", req->r_tid); list_del_init(&req->r_wait); __unregister_request(mdsc, req); } } mutex_unlock(&mdsc->mutex); doutc(cl, "done\n"); } void send_flush_mdlog(struct ceph_mds_session *s) { struct ceph_client *cl = s->s_mdsc->fsc->client; struct ceph_msg *msg; /* * Pre-luminous MDS crashes when it sees an unknown session request */ if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS)) return; mutex_lock(&s->s_mutex); doutc(cl, "request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, s->s_seq); if (!msg) { pr_err_client(cl, "failed to request mdlog flush to mds%d (%s) seq %lld\n", s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); } else { ceph_con_send(&s->s_con, msg); } mutex_unlock(&s->s_mutex); } static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, struct ceph_mds_cap_auth *auth, const struct cred *cred, char *tpath) { u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); struct ceph_client *cl = mdsc->fsc->client; const char *spath = mdsc->fsc->mount_options->server_path; bool gid_matched = false; u32 gid, tlen, len; int i, j; doutc(cl, "match.uid %lld\n", auth->match.uid); if (auth->match.uid != MDS_AUTH_UID_ANY) { if (auth->match.uid != caller_uid) return 0; if (auth->match.num_gids) { for (i = 0; i < auth->match.num_gids; i++) { if (caller_gid == auth->match.gids[i]) gid_matched = true; } if (!gid_matched && cred->group_info->ngroups) { for (i = 0; i < cred->group_info->ngroups; i++) { gid = from_kgid(&init_user_ns, cred->group_info->gid[i]); for (j = 0; j < auth->match.num_gids; j++) { if (gid == auth->match.gids[j]) { gid_matched = true; break; } } if (gid_matched) break; } } if (!gid_matched) return 0; } } /* path match */ if (auth->match.path) { if (!tpath) return 0; tlen = strlen(tpath); len = strlen(auth->match.path); if (len) { char *_tpath = tpath; bool free_tpath = false; int m, n; doutc(cl, "server path %s, tpath %s, match.path %s\n", spath, tpath, auth->match.path); if (spath && (m = strlen(spath)) != 1) { /* mount path + '/' + tpath + an extra space */ n = m + 1 + tlen + 1; _tpath = kmalloc(n, GFP_NOFS); if (!_tpath) return -ENOMEM; /* remove the leading '/' */ snprintf(_tpath, n, "%s/%s", spath + 1, tpath); free_tpath = true; tlen = strlen(_tpath); } /* * Please note the tailing '/' for match.path has already * been removed when parsing. * * Remove the tailing '/' for the target path. */ while (tlen && _tpath[tlen - 1] == '/') { _tpath[tlen - 1] = '\0'; tlen -= 1; } doutc(cl, "_tpath %s\n", _tpath); /* * In case first == _tpath && tlen == len: * match.path=/foo --> /foo _path=/foo --> match * match.path=/foo/ --> /foo _path=/foo --> match * * In case first == _tmatch.path && tlen > len: * match.path=/foo/ --> /foo _path=/foo/ --> match * match.path=/foo --> /foo _path=/foo/ --> match * match.path=/foo/ --> /foo _path=/foo/d --> match * match.path=/foo --> /foo _path=/food --> mismatch * * All the other cases --> mismatch */ bool path_matched = true; char *first = strstr(_tpath, auth->match.path); if (first != _tpath || (tlen > len && _tpath[len] != '/')) { path_matched = false; } if (free_tpath) kfree(_tpath); if (!path_matched) return 0; } } doutc(cl, "matched\n"); return 1; } int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask) { const struct cred *cred = get_current_cred(); u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); struct ceph_mds_cap_auth *rw_perms_s = NULL; struct ceph_client *cl = mdsc->fsc->client; bool root_squash_perms = true; int i, err; doutc(cl, "tpath '%s', mask %d, caller_uid %d, caller_gid %d\n", tpath, mask, caller_uid, caller_gid); for (i = 0; i < mdsc->s_cap_auths_num; i++) { struct ceph_mds_cap_auth *s = &mdsc->s_cap_auths[i]; err = ceph_mds_auth_match(mdsc, s, cred, tpath); if (err < 0) { put_cred(cred); return err; } else if (err > 0) { /* always follow the last auth caps' permission */ root_squash_perms = true; rw_perms_s = NULL; if ((mask & MAY_WRITE) && s->writeable && s->match.root_squash && (!caller_uid || !caller_gid)) root_squash_perms = false; if (((mask & MAY_WRITE) && !s->writeable) || ((mask & MAY_READ) && !s->readable)) rw_perms_s = s; } } put_cred(cred); doutc(cl, "root_squash_perms %d, rw_perms_s %p\n", root_squash_perms, rw_perms_s); if (root_squash_perms && rw_perms_s == NULL) { doutc(cl, "access allowed\n"); return 0; } if (!root_squash_perms) { doutc(cl, "root_squash is enabled and user(%d %d) isn't allowed to write", caller_uid, caller_gid); } if (rw_perms_s) { doutc(cl, "mds auth caps readable/writeable %d/%d while request r/w %d/%d", rw_perms_s->readable, rw_perms_s->writeable, !!(mask & MAY_READ), !!(mask & MAY_WRITE)); } doutc(cl, "access denied\n"); return -EACCES; } /* * called before mount is ro, and before dentries are torn down. * (hmm, does this still race with new lookups?) */ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) { doutc(mdsc->fsc->client, "begin\n"); mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN; ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); /* * wait for reply handlers to drop their request refs and * their inode/dcache refs */ ceph_msgr_flush(); ceph_cleanup_quotarealms_inodes(mdsc); doutc(mdsc->fsc->client, "done\n"); } /* * flush the mdlog and wait for all write mds requests to flush. */ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req = NULL, *nextreq; struct ceph_mds_session *last_session = NULL; struct rb_node *n; mutex_lock(&mdsc->mutex); doutc(cl, "want %lld\n", want_tid); restart: req = __get_oldest_req(mdsc); while (req && req->r_tid <= want_tid) { /* find next request */ n = rb_next(&req->r_node); if (n) nextreq = rb_entry(n, struct ceph_mds_request, r_node); else nextreq = NULL; if (req->r_op != CEPH_MDS_OP_SETFILELOCK && (req->r_op & CEPH_MDS_OP_WRITE)) { struct ceph_mds_session *s = req->r_session; if (!s) { req = nextreq; continue; } /* write op */ ceph_mdsc_get_request(req); if (nextreq) ceph_mdsc_get_request(nextreq); s = ceph_get_mds_session(s); mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDS */ if (last_session != s) { send_flush_mdlog(s); ceph_put_mds_session(last_session); last_session = s; } else { ceph_put_mds_session(s); } doutc(cl, "wait on %llu (want %llu)\n", req->r_tid, want_tid); wait_for_completion(&req->r_safe_completion); mutex_lock(&mdsc->mutex); ceph_mdsc_put_request(req); if (!nextreq) break; /* next dne before, so we're done! */ if (RB_EMPTY_NODE(&nextreq->r_node)) { /* next request was removed from tree */ ceph_mdsc_put_request(nextreq); goto restart; } ceph_mdsc_put_request(nextreq); /* won't go away */ } req = nextreq; } mutex_unlock(&mdsc->mutex); ceph_put_mds_session(last_session); doutc(cl, "done\n"); } void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; u64 want_tid, want_flush; if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return; doutc(cl, "sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; mutex_unlock(&mdsc->mutex); ceph_flush_dirty_caps(mdsc); ceph_flush_cap_releases(mdsc); spin_lock(&mdsc->cap_dirty_lock); want_flush = mdsc->last_cap_flush_tid; if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_last_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); cf->wake = true; } spin_unlock(&mdsc->cap_dirty_lock); doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush); flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid); wait_caps_flush(mdsc, want_flush); } /* * true if all sessions are closed, or we force unmount */ static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) { if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) return true; return atomic_read(&mdsc->num_sessions) <= skipped; } /* * called after sb is ro or when metadata corrupted. */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *session; int i; int skipped = 0; doutc(cl, "begin\n"); /* close sessions */ mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { session = __ceph_lookup_mds_session(mdsc, i); if (!session) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); if (__close_session(mdsc, session) <= 0) skipped++; mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); doutc(cl, "waiting for sessions to close\n"); wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc, skipped), ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { if (mdsc->sessions[i]) { session = ceph_get_mds_session(mdsc->sessions[i]); __unregister_session(mdsc, session); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); remove_session_caps(session); mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } } WARN_ON(!list_empty(&mdsc->cap_delay_list)); mutex_unlock(&mdsc->mutex); ceph_cleanup_snapid_map(mdsc); ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); cancel_work_sync(&mdsc->cap_unlink_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ doutc(cl, "done\n"); } void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int mds; doutc(mdsc->fsc->client, "force umount\n"); mutex_lock(&mdsc->mutex); for (mds = 0; mds < mdsc->max_sessions; mds++) { session = __ceph_lookup_mds_session(mdsc, mds); if (!session) continue; if (session->s_state == CEPH_MDS_SESSION_REJECTED) __unregister_session(mdsc, session); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); __close_session(mdsc, session); if (session->s_state == CEPH_MDS_SESSION_CLOSING) { cleanup_session_requests(mdsc, session); remove_session_caps(session); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); kick_requests(mdsc, mds); } __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); } static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { doutc(mdsc->fsc->client, "stop\n"); /* * Make sure the delayed work stopped before releasing * the resources. * * Because the cancel_delayed_work_sync() will only * guarantee that the work finishes executing. But the * delayed work will re-arm itself again after that. */ flush_delayed_work(&mdsc->delayed_work); if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); if (mdsc->s_cap_auths) { int i; for (i = 0; i < mdsc->s_cap_auths_num; i++) { kfree(mdsc->s_cap_auths[i].match.gids); kfree(mdsc->s_cap_auths[i].match.path); kfree(mdsc->s_cap_auths[i].match.fs_name); } kfree(mdsc->s_cap_auths); } ceph_pool_perm_destroy(mdsc); } void ceph_mdsc_destroy(struct ceph_fs_client *fsc) { struct ceph_mds_client *mdsc = fsc->mdsc; doutc(fsc->client, "%p\n", mdsc); if (!mdsc) return; /* flush out any connection work with references to us */ ceph_msgr_flush(); ceph_mdsc_stop(mdsc); ceph_metric_destroy(&mdsc->metric); fsc->mdsc = NULL; kfree(mdsc); doutc(fsc->client, "%p done\n", mdsc); } void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { struct ceph_fs_client *fsc = mdsc->fsc; struct ceph_client *cl = fsc->client; const char *mds_namespace = fsc->mount_options->mds_namespace; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; u32 epoch; u32 num_fs; u32 mount_fscid = (u32)-1; int err = -EINVAL; ceph_decode_need(&p, end, sizeof(u32), bad); epoch = ceph_decode_32(&p); doutc(cl, "epoch %u\n", epoch); /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */ ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad); ceph_decode_32_safe(&p, end, num_fs, bad); while (num_fs-- > 0) { void *info_p, *info_end; u32 info_len; u32 fscid, namelen; ceph_decode_need(&p, end, 2 + sizeof(u32), bad); p += 2; // info_v, info_cv info_len = ceph_decode_32(&p); ceph_decode_need(&p, end, info_len, bad); info_p = p; info_end = p + info_len; p = info_end; ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad); fscid = ceph_decode_32(&info_p); namelen = ceph_decode_32(&info_p); ceph_decode_need(&info_p, info_end, namelen, bad); if (mds_namespace && strlen(mds_namespace) == namelen && !strncmp(mds_namespace, (char *)info_p, namelen)) { mount_fscid = fscid; break; } } ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch); if (mount_fscid != (u32)-1) { fsc->client->monc.fs_cluster_id = mount_fscid; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); ceph_monc_renew_subs(&fsc->client->monc); } else { err = -ENOENT; goto err_out; } return; bad: pr_err_client(cl, "error decoding fsmap %d. Shutting down mount.\n", err); ceph_umount_begin(mdsc->fsc->sb); ceph_msg_dump(msg); err_out: mutex_lock(&mdsc->mutex); mdsc->mdsmap_err = err; __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); } /* * handle mds map update. */ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) { struct ceph_client *cl = mdsc->fsc->client; u32 epoch; u32 maplen; void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; struct ceph_mdsmap *newmap, *oldmap; struct ceph_fsid fsid; int err = -EINVAL; ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); doutc(cl, "epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { doutc(cl, "epoch %u <= our %u\n", epoch, mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); return; } newmap = ceph_mdsmap_decode(mdsc, &p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; } /* swap into place */ if (mdsc->mdsmap) { oldmap = mdsc->mdsmap; mdsc->mdsmap = newmap; check_new_map(mdsc, newmap, oldmap); ceph_mdsmap_destroy(oldmap); } else { mdsc->mdsmap = newmap; /* first mds map */ } mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size, MAX_LFS_FILESIZE); __wake_requests(mdsc, &mdsc->waiting_for_map); ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); schedule_delayed(mdsc, 0); return; bad_unlock: mutex_unlock(&mdsc->mutex); bad: pr_err_client(cl, "error decoding mdsmap %d. Shutting down mount.\n", err); ceph_umount_begin(mdsc->fsc->sb); ceph_msg_dump(msg); return; } static struct ceph_connection *mds_get_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; if (ceph_get_mds_session(s)) return con; return NULL; } static void mds_put_con(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; ceph_put_mds_session(s); } /* * if the client is unresponsive for long enough, the mds will kill * the session entirely. */ static void mds_peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n", s->s_mds); if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO && ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT) send_mds_reconnect(mdsc, s); } static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; int type = le16_to_cpu(msg->hdr.type); mutex_lock(&mdsc->mutex); if (__verify_registered_session(mdsc, s) < 0) { mutex_unlock(&mdsc->mutex); goto out; } mutex_unlock(&mdsc->mutex); switch (type) { case CEPH_MSG_MDS_MAP: ceph_mdsc_handle_mdsmap(mdsc, msg); break; case CEPH_MSG_FS_MAP_USER: ceph_mdsc_handle_fsmap(mdsc, msg); break; case CEPH_MSG_CLIENT_SESSION: handle_session(s, msg); break; case CEPH_MSG_CLIENT_REPLY: handle_reply(s, msg); break; case CEPH_MSG_CLIENT_REQUEST_FORWARD: handle_forward(mdsc, s, msg); break; case CEPH_MSG_CLIENT_CAPS: ceph_handle_caps(s, msg); break; case CEPH_MSG_CLIENT_SNAP: ceph_handle_snap(mdsc, s, msg); break; case CEPH_MSG_CLIENT_LEASE: handle_lease(mdsc, s, msg); break; case CEPH_MSG_CLIENT_QUOTA: ceph_handle_quota(mdsc, s, msg); break; default: pr_err_client(cl, "received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } out: ceph_msg_put(msg); } /* * authentication */ /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. */ static struct ceph_auth_handshake * mds_get_authorizer(struct ceph_connection *con, int *proto, int force_new) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; int ret; ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, force_new, proto, NULL, NULL); if (ret) return ERR_PTR(ret); return auth; } static int mds_add_authorizer_challenge(struct ceph_connection *con, void *challenge_buf, int challenge_buf_len) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer, challenge_buf, challenge_buf_len); } static int mds_verify_authorizer_reply(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, NULL, NULL, NULL, NULL); } static int mds_invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } static int mds_get_auth_request(struct ceph_connection *con, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_mds_session *s = con->private; struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; int ret; ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int mds_handle_auth_reply_more(struct ceph_connection *con, void *reply, int reply_len, void *buf, int *buf_len, void **authorizer, int *authorizer_len) { struct ceph_mds_session *s = con->private; struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; int ret; ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, buf, buf_len); if (ret) return ret; *authorizer = auth->authorizer_buf; *authorizer_len = auth->authorizer_buf_len; return 0; } static int mds_handle_auth_done(struct ceph_connection *con, u64 global_id, void *reply, int reply_len, u8 *session_key, int *session_key_len, u8 *con_secret, int *con_secret_len) { struct ceph_mds_session *s = con->private; struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, session_key, session_key_len, con_secret, con_secret_len); } static int mds_handle_auth_bad_method(struct ceph_connection *con, int used_proto, int result, const int *allowed_protos, int proto_cnt, const int *allowed_modes, int mode_cnt) { struct ceph_mds_session *s = con->private; struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc; int ret; if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS, used_proto, result, allowed_protos, proto_cnt, allowed_modes, mode_cnt)) { ret = ceph_monc_validate_auth(monc); if (ret) return ret; } return -EACCES; } static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { struct ceph_msg *msg; int type = (int) le16_to_cpu(hdr->type); int front_len = (int) le32_to_cpu(hdr->front_len); if (con->in_msg) return con->in_msg; *skip = 0; msg = ceph_msg_new(type, front_len, GFP_NOFS, false); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); return NULL; } return msg; } static int mds_sign_message(struct ceph_msg *msg) { struct ceph_mds_session *s = msg->con->private; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_sign_message(auth, msg); } static int mds_check_message_signature(struct ceph_msg *msg) { struct ceph_mds_session *s = msg->con->private; struct ceph_auth_handshake *auth = &s->s_auth; return ceph_auth_check_message_signature(auth, msg); } static const struct ceph_connection_operations mds_con_ops = { .get = mds_get_con, .put = mds_put_con, .alloc_msg = mds_alloc_msg, .dispatch = mds_dispatch, .peer_reset = mds_peer_reset, .get_authorizer = mds_get_authorizer, .add_authorizer_challenge = mds_add_authorizer_challenge, .verify_authorizer_reply = mds_verify_authorizer_reply, .invalidate_authorizer = mds_invalidate_authorizer, .sign_message = mds_sign_message, .check_message_signature = mds_check_message_signature, .get_auth_request = mds_get_auth_request, .handle_auth_reply_more = mds_handle_auth_reply_more, .handle_auth_done = mds_handle_auth_done, .handle_auth_bad_method = mds_handle_auth_bad_method, }; /* eof */
13273 13273 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 /* SPDX-License-Identifier: GPL-2.0 */ /* * Latched RB-trees * * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org> * * Since RB-trees have non-atomic modifications they're not immediately suited * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for * lockless lookups; we cannot guarantee they return a correct result. * * The simplest solution is a seqlock + RB-tree, this will allow lockless * lookups; but has the constraint (inherent to the seqlock) that read sides * cannot nest in write sides. * * If we need to allow unconditional lookups (say as required for NMI context * usage) we need a more complex setup; this data structure provides this by * employing the latch technique -- see @write_seqcount_latch_begin -- to * implement a latched RB-tree which does allow for unconditional lookups by * virtue of always having (at least) one stable copy of the tree. * * However, while we have the guarantee that there is at all times one stable * copy, this does not guarantee an iteration will not observe modifications. * What might have been a stable copy at the start of the iteration, need not * remain so for the duration of the iteration. * * Therefore, this does require a lockless RB-tree iteration to be non-fatal; * see the comment in lib/rbtree.c. Note however that we only require the first * condition -- not seeing partial stores -- because the latch thing isolates * us from loops. If we were to interrupt a modification the lookup would be * pointed at the stable tree and complete while the modification was halted. */ #ifndef RB_TREE_LATCH_H #define RB_TREE_LATCH_H #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/rcupdate.h> struct latch_tree_node { struct rb_node node[2]; }; struct latch_tree_root { seqcount_latch_t seq; struct rb_root tree[2]; }; /** * latch_tree_ops - operators to define the tree order * @less: used for insertion; provides the (partial) order between two elements. * @comp: used for lookups; provides the order between the search key and an element. * * The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * latch_tree_find(). */ struct latch_tree_ops { bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b); int (*comp)(void *key, struct latch_tree_node *b); }; static __always_inline struct latch_tree_node * __lt_from_rb(struct rb_node *node, int idx) { return container_of(node, struct latch_tree_node, node[idx]); } static __always_inline void __lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx, bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b)) { struct rb_root *root = &ltr->tree[idx]; struct rb_node **link = &root->rb_node; struct rb_node *node = &ltn->node[idx]; struct rb_node *parent = NULL; struct latch_tree_node *ltp; while (*link) { parent = *link; ltp = __lt_from_rb(parent, idx); if (less(ltn, ltp)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, root); } static __always_inline void __lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx) { rb_erase(&ltn->node[idx], &ltr->tree[idx]); } static __always_inline struct latch_tree_node * __lt_find(void *key, struct latch_tree_root *ltr, int idx, int (*comp)(void *key, struct latch_tree_node *node)) { struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node); struct latch_tree_node *ltn; int c; while (node) { ltn = __lt_from_rb(node, idx); c = comp(key, ltn); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return ltn; } return NULL; } /** * latch_tree_insert() - insert @node into the trees @root * @node: nodes to insert * @root: trees to insert @node into * @ops: operators defining the node order * * It inserts @node into @root in an ordered fashion such that we can always * observe one complete tree. See the comment for write_seqcount_latch_begin(). * * The inserts use rcu_assign_pointer() to publish the element such that the * tree structure is stored before we can observe the new @node. * * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be * serialized. */ static __always_inline void latch_tree_insert(struct latch_tree_node *node, struct latch_tree_root *root, const struct latch_tree_ops *ops) { write_seqcount_latch_begin(&root->seq); __lt_insert(node, root, 0, ops->less); write_seqcount_latch(&root->seq); __lt_insert(node, root, 1, ops->less); write_seqcount_latch_end(&root->seq); } /** * latch_tree_erase() - removes @node from the trees @root * @node: nodes to remote * @root: trees to remove @node from * @ops: operators defining the node order * * Removes @node from the trees @root in an ordered fashion such that we can * always observe one complete tree. See the comment for * write_seqcount_latch_begin(). * * It is assumed that @node will observe one RCU quiescent state before being * reused of freed. * * All modifications (latch_tree_insert, latch_tree_remove) are assumed to be * serialized. */ static __always_inline void latch_tree_erase(struct latch_tree_node *node, struct latch_tree_root *root, const struct latch_tree_ops *ops) { write_seqcount_latch_begin(&root->seq); __lt_erase(node, root, 0); write_seqcount_latch(&root->seq); __lt_erase(node, root, 1); write_seqcount_latch_end(&root->seq); } /** * latch_tree_find() - find the node matching @key in the trees @root * @key: search key * @root: trees to search for @key * @ops: operators defining the node order * * Does a lockless lookup in the trees @root for the node matching @key. * * It is assumed that this is called while holding the appropriate RCU read * side lock. * * If the operators define a partial order on the elements (there are multiple * elements which have the same key value) it is undefined which of these * elements will be found. Nor is it possible to iterate the tree to find * further elements with the same key value. * * Returns: a pointer to the node matching @key or NULL. */ static __always_inline struct latch_tree_node * latch_tree_find(void *key, struct latch_tree_root *root, const struct latch_tree_ops *ops) { struct latch_tree_node *node; unsigned int seq; do { seq = read_seqcount_latch(&root->seq); node = __lt_find(key, root, seq & 1, ops->comp); } while (read_seqcount_latch_retry(&root->seq, seq)); return node; } #endif /* RB_TREE_LATCH_H */
34 32 3 32 1 1 1 29 29 25 4 3 13 10 3 3 2 1 3 79 11 17 17 37 25 17 6 11 16 1 16 13 3 13 4 4 2 2 1 3 1 1 1 2 4 4 1 3 2 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/resize.c * * Support for resizing an ext4 filesystem while it is mounted. * * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com> * * This could probably be made into a module, because it is not often in use. */ #include <linux/errno.h> #include <linux/slab.h> #include <linux/jiffies.h> #include "ext4_jbd2.h" struct ext4_rcu_ptr { struct rcu_head rcu; void *ptr; }; static void ext4_rcu_ptr_callback(struct rcu_head *head) { struct ext4_rcu_ptr *ptr; ptr = container_of(head, struct ext4_rcu_ptr, rcu); kvfree(ptr->ptr); kfree(ptr); } void ext4_kvfree_array_rcu(void *to_free) { struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); if (ptr) { ptr->ptr = to_free; call_rcu(&ptr->rcu, ext4_rcu_ptr_callback); return; } synchronize_rcu(); kvfree(to_free); } int ext4_resize_begin(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); int ret = 0; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; /* * If the reserved GDT blocks is non-zero, the resize_inode feature * should always be set. */ if (sbi->s_es->s_reserved_gdt_blocks && !ext4_has_feature_resize_inode(sb)) { ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); return -EFSCORRUPTED; } /* * If we are not using the primary superblock/GDT copy don't resize, * because the user tools have no way of handling this. Probably a * bad time to do it anyways. */ if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != le32_to_cpu(sbi->s_es->s_first_data_block)) { ext4_warning(sb, "won't resize using backup superblock at %llu", (unsigned long long)sbi->s_sbh->b_blocknr); return -EPERM; } /* * We are not allowed to do online-resizing on a filesystem mounted * with error, because it can destroy the filesystem easily. */ if (sbi->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " "so online resizing is not allowed"); return -EPERM; } if (ext4_has_feature_sparse_super2(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); return -EOPNOTSUPP; } if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags)) ret = -EBUSY; return ret; } int ext4_resize_end(struct super_block *sb, bool update_backups) { clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags); smp_mb__after_atomic(); if (update_backups) return ext4_update_overhead(sb, true); return 0; } static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, ext4_group_t group) { ext4_grpblk_t overhead; overhead = ext4_bg_num_gdb(sb, group); if (ext4_bg_has_super(sb, group)) overhead += 1 + le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); return overhead; } #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) static int verify_group_input(struct super_block *sb, struct ext4_new_group_data *input) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t start = ext4_blocks_count(es); ext4_fsblk_t end = start + input->blocks_count; ext4_group_t group = input->group; ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; unsigned overhead; ext4_fsblk_t metaend; struct buffer_head *bh = NULL; ext4_grpblk_t free_blocks_count, offset; int err = -EINVAL; if (group != sbi->s_groups_count) { ext4_warning(sb, "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); return -EINVAL; } overhead = ext4_group_overhead_blocks(sb, group); metaend = start + overhead; free_blocks_count = input->blocks_count - 2 - overhead - sbi->s_itb_per_group; input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " "(%d free, %u reserved)\n", ext4_bg_has_super(sb, input->group) ? "normal" : "no-super", input->group, input->blocks_count, free_blocks_count, input->reserved_blocks); ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (offset != 0) ext4_warning(sb, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) ext4_warning(sb, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) ext4_warning(sb, "Bad blocks count %u", input->blocks_count); else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) { err = PTR_ERR(bh); bh = NULL; ext4_warning(sb, "Cannot read last block (%llu)", end - 1); } else if (outside(input->block_bitmap, start, end)) ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) ext4_warning(sb, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) ext4_warning(sb, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) ext4_warning(sb, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) ext4_warning(sb, "Block bitmap (%llu) in inode table " "(%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) ext4_warning(sb, "Inode bitmap (%llu) in inode table " "(%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->block_bitmap, start, metaend)) ext4_warning(sb, "Block bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) ext4_warning(sb, "Inode bitmap (%llu) in GDT table (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) ext4_warning(sb, "Inode table (%llu-%llu) overlaps GDT table " "(%llu-%llu)", (unsigned long long)input->inode_table, itend - 1, start, metaend - 1); else err = 0; brelse(bh); return err; } /* * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex * group each time. */ struct ext4_new_flex_group_data { struct ext4_new_group_data *groups; /* new_group_data for groups in the flex group */ __u16 *bg_flags; /* block group flags of groups in @groups */ ext4_group_t resize_bg; /* number of allocated new_group_data */ ext4_group_t count; /* number of groups in @groups */ }; /* * Avoiding memory allocation failures due to too many groups added each time. */ #define MAX_RESIZE_BG 16384 /* * alloc_flex_gd() allocates an ext4_new_flex_group_data that satisfies the * resizing from @o_group to @n_group, its size is typically @flexbg_size. * * Returns NULL on failure otherwise address of the allocated structure. */ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size, ext4_group_t o_group, ext4_group_t n_group) { ext4_group_t last_group; unsigned int max_resize_bg; struct ext4_new_flex_group_data *flex_gd; flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS); if (flex_gd == NULL) goto out3; max_resize_bg = umin(flexbg_size, MAX_RESIZE_BG); flex_gd->resize_bg = max_resize_bg; /* Avoid allocating large 'groups' array if not needed */ last_group = o_group | (flex_gd->resize_bg - 1); if (n_group <= last_group) flex_gd->resize_bg = 1 << fls(n_group - o_group); else if (n_group - last_group < flex_gd->resize_bg) flex_gd->resize_bg = 1 << max(fls(last_group - o_group), fls(n_group - last_group)); if (WARN_ON_ONCE(flex_gd->resize_bg > max_resize_bg)) flex_gd->resize_bg = max_resize_bg; flex_gd->groups = kmalloc_array(flex_gd->resize_bg, sizeof(struct ext4_new_group_data), GFP_NOFS); if (flex_gd->groups == NULL) goto out2; flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16), GFP_NOFS); if (flex_gd->bg_flags == NULL) goto out1; return flex_gd; out1: kfree(flex_gd->groups); out2: kfree(flex_gd); out3: return NULL; } static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd) { kfree(flex_gd->bg_flags); kfree(flex_gd->groups); kfree(flex_gd); } /* * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps * and inode tables for a flex group. * * This function is used by 64bit-resize. Note that this function allocates * group tables from the 1st group of groups contained by @flexgd, which may * be a partial of a flex group. * * @sb: super block of fs to which the groups belongs * * Returns 0 on a successful allocation of the metadata blocks in the * block group. */ static int ext4_alloc_group_tables(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, unsigned int flexbg_size) { struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t start_blk; ext4_fsblk_t last_blk; ext4_group_t src_group; ext4_group_t bb_index = 0; ext4_group_t ib_index = 0; ext4_group_t it_index = 0; ext4_group_t group; ext4_group_t last_group; unsigned overhead; __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; int i; BUG_ON(flex_gd->count == 0 || group_data == NULL); src_group = group_data[0].group; last_group = src_group + flex_gd->count - 1; BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) != (last_group & ~(flexbg_size - 1)))); next_group: group = group_data[0].group; if (src_group >= group_data[0].group + flex_gd->count) return -ENOSPC; start_blk = ext4_group_first_block_no(sb, src_group); last_blk = start_blk + group_data[src_group - group].blocks_count; overhead = ext4_group_overhead_blocks(sb, src_group); start_blk += overhead; /* We collect contiguous blocks as much as possible. */ src_group++; for (; src_group <= last_group; src_group++) { overhead = ext4_group_overhead_blocks(sb, src_group); if (overhead == 0) last_blk += group_data[src_group - group].blocks_count; else break; } /* Allocate block bitmaps */ for (; bb_index < flex_gd->count; bb_index++) { if (start_blk >= last_blk) goto next_group; group_data[bb_index].block_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode bitmaps */ for (; ib_index < flex_gd->count; ib_index++) { if (start_blk >= last_blk) goto next_group; group_data[ib_index].inode_bitmap = start_blk++; group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].mdata_blocks++; flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode tables */ for (; it_index < flex_gd->count; it_index++) { unsigned int itb = EXT4_SB(sb)->s_itb_per_group; ext4_fsblk_t next_group_start; if (start_blk + itb > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; group = ext4_get_group_number(sb, start_blk); next_group_start = ext4_group_first_block_no(sb, group + 1); group -= group_data[0].group; if (start_blk + itb > next_group_start) { flex_gd->bg_flags[group + 1] &= uninit_mask; overhead = start_blk + itb - next_group_start; group_data[group + 1].mdata_blocks += overhead; itb -= overhead; } group_data[group].mdata_blocks += itb; flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } /* Update free clusters count to exclude metadata blocks */ for (i = 0; i < flex_gd->count; i++) { group_data[i].free_clusters_count -= EXT4_NUM_B2C(EXT4_SB(sb), group_data[i].mdata_blocks); } if (test_opt(sb, DEBUG)) { int i; group = group_data[0].group; printk(KERN_DEBUG "EXT4-fs: adding a flex group with " "%u groups, flexbg size is %u:\n", flex_gd->count, flexbg_size); for (i = 0; i < flex_gd->count; i++) { ext4_debug( "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n", ext4_bg_has_super(sb, group + i) ? "normal" : "no-super", group + i, group_data[i].blocks_count, group_data[i].free_clusters_count, group_data[i].mdata_blocks); } } return 0; } static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, ext4_fsblk_t blk) { struct buffer_head *bh; int err; bh = sb_getblk(sb, blk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); bh = ERR_PTR(err); } else { memset(bh->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bh); } return bh; } static int ext4_resize_ensure_credits_batch(handle_t *handle, int credits) { return ext4_journal_ensure_credits_fn(handle, credits, EXT4_MAX_TRANS_DATA, 0, 0); } /* * set_flexbg_block_bitmap() mark clusters [@first_cluster, @last_cluster] used. * * Helper function for ext4_setup_new_group_blocks() which set . * * @sb: super block * @handle: journal handle * @flex_gd: flex group data */ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, struct ext4_new_flex_group_data *flex_gd, ext4_fsblk_t first_cluster, ext4_fsblk_t last_cluster) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t count = last_cluster - first_cluster + 1; ext4_group_t count2; ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, last_cluster); for (; count > 0; count -= count2, first_cluster += count2) { ext4_fsblk_t start; struct buffer_head *bh; ext4_group_t group; int err; group = ext4_get_group_number(sb, EXT4_C2B(sbi, first_cluster)); start = EXT4_B2C(sbi, ext4_group_first_block_no(sb, group)); group -= flex_gd->groups[0].group; count2 = EXT4_CLUSTERS_PER_GROUP(sb) - (first_cluster - start); if (count2 > count) count2 = count; if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) { BUG_ON(flex_gd->count > 1); continue; } err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) return err; bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap); if (unlikely(!bh)) return -ENOMEM; BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", first_cluster, first_cluster - start, count2); mb_set_bits(bh->b_data, first_cluster - start, count2); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); if (unlikely(err)) return err; } return 0; } /* * Set up the block and inode bitmaps, and the inode table for the new groups. * This doesn't need to be part of the main transaction, since we are only * changing blocks outside the actual filesystem. We still do journaling to * ensure the recovery is correct in case of a failure just after resize. * If any part of this fails, we simply abort the resize. * * setup_new_flex_group_blocks handles a flex group as follow: * 1. copy super block and GDT, and initialize group tables if necessary. * In this step, we only set bits in blocks bitmaps for blocks taken by * super block and GDT. * 2. allocate group tables in block bitmaps, that is, set bits in block * bitmap for blocks taken by group tables. */ static int setup_new_flex_group_blocks(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd) { int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group}; ext4_fsblk_t start; ext4_fsblk_t block; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_new_group_data *group_data = flex_gd->groups; __u16 *bg_flags = flex_gd->bg_flags; handle_t *handle; ext4_group_t group, count; struct buffer_head *bh = NULL; int reserved_gdb, i, j, err = 0, err2; int meta_bg; BUG_ON(!flex_gd->count || !group_data || group_data[0].group != sbi->s_groups_count); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); meta_bg = ext4_has_feature_meta_bg(sb); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) return PTR_ERR(handle); group = group_data[0].group; for (i = 0; i < flex_gd->count; i++, group++) { unsigned long gdblocks; ext4_grpblk_t overhead; gdblocks = ext4_bg_num_gdb(sb, group); start = ext4_group_first_block_no(sb, group); if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) goto handle_itb; if (meta_bg == 1) goto handle_itb; block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ for (j = 0; j < gdblocks; j++, block++) { struct buffer_head *gdb; ext4_debug("update backup group %#04llx\n", block); err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) goto out; gdb = sb_getblk(sb, block); if (unlikely(!gdb)) { err = -ENOMEM; goto out; } BUFFER_TRACE(gdb, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb, EXT4_JTR_NONE); if (err) { brelse(gdb); goto out; } memcpy(gdb->b_data, sbi_array_rcu_deref(sbi, s_group_desc, j)->b_data, gdb->b_size); set_buffer_uptodate(gdb); err = ext4_handle_dirty_metadata(handle, NULL, gdb); if (unlikely(err)) { brelse(gdb); goto out; } brelse(gdb); } /* Zero out all of the reserved backup group descriptor * table blocks */ if (ext4_bg_has_super(sb, group)) { err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, GFP_NOFS); if (err) goto out; } handle_itb: /* Initialize group tables of the group @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; /* Zero out all of the inode table blocks */ block = group_data[i].inode_table; ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", block, sbi->s_itb_per_group); err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto out; handle_bb: if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT) goto handle_ib; /* Initialize block bitmap of the @group */ block = group_data[i].block_bitmap; err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) goto out; bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); goto out; } overhead = ext4_group_overhead_blocks(sb, group); if (overhead != 0) { ext4_debug("mark backup superblock %#04llx (+0)\n", start); mb_set_bits(bh->b_data, 0, EXT4_NUM_B2C(sbi, overhead)); } ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); if (err) goto out; handle_ib: if (bg_flags[i] & EXT4_BG_INODE_UNINIT) continue; /* Initialize inode bitmap of the @group */ block = group_data[i].inode_bitmap; err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) goto out; /* Mark unused entries in inode bitmap used */ bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); goto out; } ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); if (err) goto out; } /* Mark group tables in block bitmap */ for (j = 0; j < GROUP_TABLE_COUNT; j++) { count = group_table_count[j]; start = (&group_data[0].block_bitmap)[j]; block = start; for (i = 1; i < flex_gd->count; i++) { block += group_table_count[j]; if (block == (&group_data[i].block_bitmap)[j]) { count += group_table_count[j]; continue; } err = set_flexbg_block_bitmap(sb, handle, flex_gd, EXT4_B2C(sbi, start), EXT4_B2C(sbi, start + count - 1)); if (err) goto out; count = group_table_count[j]; start = (&group_data[i].block_bitmap)[j]; block = start; } err = set_flexbg_block_bitmap(sb, handle, flex_gd, EXT4_B2C(sbi, start), EXT4_B2C(sbi, start + count - 1)); if (err) goto out; } out: err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; return err; } /* * Iterate through the groups which hold BACKUP superblock/GDT copies in an * ext4 filesystem. The counters should be initialized to 1, 5, and 7 before * calling this for the first time. In a sparse filesystem it will be the * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... */ unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, unsigned int *five, unsigned int *seven) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned int *min = three; int mult = 3; unsigned int ret; if (ext4_has_feature_sparse_super2(sb)) { do { if (*min > 2) return UINT_MAX; ret = le32_to_cpu(es->s_backup_bgs[*min - 1]); *min += 1; } while (!ret); return ret; } if (!ext4_has_feature_sparse_super(sb)) { ret = *min; *min += 1; return ret; } if (*five < *min) { min = five; mult = 5; } if (*seven < *min) { min = seven; mult = 7; } ret = *min; *min *= mult; return ret; } /* * Check that all of the backup GDT blocks are held in the primary GDT block. * It is assumed that they are stored in group order. Returns the number of * groups in current filesystem that have BACKUPS, or -ve error code. */ static int verify_reserved_gdb(struct super_block *sb, ext4_group_t end, struct buffer_head *primary) { const ext4_fsblk_t blk = primary->b_blocknr; unsigned three = 1; unsigned five = 5; unsigned seven = 7; unsigned grp; __le32 *p = (__le32 *)primary->b_data; int gdbackups = 0; while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ ext4_warning(sb, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, grp * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + blk); return -EINVAL; } if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb)) return -EFBIG; } return gdbackups; } /* * Called when we need to bring a reserved group descriptor table block into * use from the resize inode. The primary copy of the new GDT block currently * is an indirect block (under the double indirect block in the resize inode). * The new backup GDT blocks will be stored as leaf blocks in this indirect * block, in group order. Even though we know all the block numbers we need, * we check to ensure that the resize inode has actually reserved these blocks. * * Don't need to update the block bitmaps because the blocks are still in use. * * We get all of the error cases out of the way, so that we are sure to not * fail once we start modifying the data on disk, because JBD has no rollback. */ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_group_t group) { struct super_block *sb = inode->i_sb; struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; struct buffer_head **o_group_desc, **n_group_desc = NULL; struct buffer_head *dind = NULL; struct buffer_head *gdb_bh = NULL; int gdbackups; struct ext4_iloc iloc = { .bh = NULL }; __le32 *data; int err; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; goto errout; } data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); if (IS_ERR(dind)) { err = PTR_ERR(dind); dind = NULL; goto errout; } data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { ext4_warning(sb, "new group %u GDT block %llu not reserved", group, gdblock); err = -EINVAL; goto errout; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE); if (unlikely(err)) { ext4_std_error(sb, err); goto errout; } /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) goto errout; n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_KERNEL); if (!n_group_desc) { err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); goto errout; } /* * Finally, we have all of the possible failures behind us... * * Remove new GDT block from inode double-indirect block and clear out * the new GDT block for use (which also "frees" the backup GDT blocks * from the reserved inode). We don't need to change the bitmaps for * these blocks, because they are marked as in-use from being in the * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; err = ext4_handle_dirty_metadata(handle, NULL, dind); if (unlikely(err)) { ext4_std_error(sb, err); goto errout; } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> (9 - EXT4_SB(sb)->s_cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); memset(gdb_bh->b_data, 0, sb->s_blocksize); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); iloc.bh = NULL; goto errout; } brelse(dind); rcu_read_lock(); o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); rcu_read_unlock(); n_group_desc[gdb_num] = gdb_bh; rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); EXT4_SB(sb)->s_gdb_count++; ext4_kvfree_array_rcu(o_group_desc); lock_buffer(EXT4_SB(sb)->s_sbh); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); if (err) ext4_std_error(sb, err); return err; errout: kvfree(n_group_desc); brelse(iloc.bh); brelse(dind); brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); return err; } /* * If there is no available space in the existing block group descriptors for * the new block group and there are no reserved block group descriptors, then * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set * to the first block group that is managed using meta_bg and s_first_meta_bg * must be a multiple of EXT4_DESC_PER_BLOCK(sb). * This function will be called when first group of meta_bg is added to bring * new group descriptors block of new added meta_bg. */ static int add_new_gdb_meta_bg(struct super_block *sb, handle_t *handle, ext4_group_t group) { ext4_fsblk_t gdblock; struct buffer_head *gdb_bh; struct buffer_head **o_group_desc, **n_group_desc; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int err; gdblock = ext4_group_first_block_no(sb, group) + ext4_bg_has_super(sb, group); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); n_group_desc = kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_KERNEL); if (!n_group_desc) { brelse(gdb_bh); err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); return err; } rcu_read_lock(); o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); rcu_read_unlock(); n_group_desc[gdb_num] = gdb_bh; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (err) { kvfree(n_group_desc); brelse(gdb_bh); return err; } rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); EXT4_SB(sb)->s_gdb_count++; ext4_kvfree_array_rcu(o_group_desc); return err; } /* * Called when we are adding a new group which has a backup copy of each of * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. * We need to add these reserved backup GDT blocks to the resize inode, so * that they are kept for future resizing and not allocated to files. * * Each reserved backup GDT block will go into a different indirect block. * The indirect blocks are actually the primary reserved GDT blocks, * so we know in advance what their block numbers are. We only get the * double-indirect block to verify it is pointing to the primary reserved * GDT blocks so we don't overwrite a data block by accident. The reserved * backup GDT blocks are stored in their reserved primary GDT block. */ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, ext4_group_t group) { struct super_block *sb = inode->i_sb; int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); int cluster_bits = EXT4_SB(sb)->s_cluster_bits; struct buffer_head **primary; struct buffer_head *dind; struct ext4_iloc iloc; ext4_fsblk_t blk; __le32 *data, *end; int gdbackups = 0; int res, i; int err; primary = kmalloc_array(reserved_gdb, sizeof(*primary), GFP_NOFS); if (!primary) return -ENOMEM; data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); if (IS_ERR(dind)) { err = PTR_ERR(dind); dind = NULL; goto exit_free; } blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count; data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count % EXT4_ADDR_PER_BLOCK(sb)); end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb); /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { ext4_warning(sb, "reserved block %llu" " not at offset %ld", blk, (long)(data - (__le32 *)dind->b_data)); err = -EINVAL; goto exit_bh; } primary[res] = ext4_sb_bread(sb, blk, 0); if (IS_ERR(primary[res])) { err = PTR_ERR(primary[res]); primary[res] = NULL; goto exit_bh; } gdbackups = verify_reserved_gdb(sb, group, primary[res]); if (gdbackups < 0) { brelse(primary[res]); err = gdbackups; goto exit_bh; } if (++data >= end) data = (__le32 *)dind->b_data; } for (i = 0; i < reserved_gdb; i++) { BUFFER_TRACE(primary[i], "get_write_access"); if ((err = ext4_journal_get_write_access(handle, sb, primary[i], EXT4_JTR_NONE))) goto exit_bh; } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) goto exit_bh; /* * Finally we can add each of the reserved backup GDT blocks from * the new group to its reserved primary GDT block. */ blk = group * EXT4_BLOCKS_PER_GROUP(sb); for (i = 0; i < reserved_gdb; i++) { int err2; data = (__le32 *)primary[i]->b_data; data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) err = err2; } inode->i_blocks += reserved_gdb * sb->s_blocksize >> (9 - cluster_bits); ext4_mark_iloc_dirty(handle, inode, &iloc); exit_bh: while (--res >= 0) brelse(primary[res]); brelse(dind); exit_free: kfree(primary); return err; } static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, ext4_group_t group) { struct ext4_super_block *es = (struct ext4_super_block *) data; es->s_block_group_nr = cpu_to_le16(group); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(sb, es); } /* * Update the backup copies of the ext4 metadata. These don't need to be part * of the main resize transaction, because e2fsck will re-write them if there * is a problem (basically only OOM will cause a problem). However, we * _should_ update the backups if possible, in case the primary gets trashed * for some reason and we need to run e2fsck from a backup superblock. The * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * * We do not need take the s_resize_lock for this, because these * blocks are not otherwise touched by the filesystem code when it is * mounted. We don't need to worry about last changing from * sbi->s_groups_count, because the worst that can happen is that we * do not copy the full number of backups at this time. The resize * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t last; const int bpg = EXT4_BLOCKS_PER_GROUP(sb); unsigned three = 1; unsigned five = 5; unsigned seven = 7; ext4_group_t group = 0; int rest = sb->s_blocksize - size; handle_t *handle; int err = 0, err2; handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); if (IS_ERR(handle)) { group = 1; err = PTR_ERR(handle); goto exit_err; } if (meta_bg == 0) { group = ext4_list_backups(sb, &three, &five, &seven); last = sbi->s_groups_count; } else { group = ext4_get_group_number(sb, blk_off) + 1; last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); } while (group < sbi->s_groups_count) { struct buffer_head *bh; ext4_fsblk_t backup_block; int has_super = ext4_bg_has_super(sb, group); ext4_fsblk_t first_block = ext4_group_first_block_no(sb, group); /* Out of journal space, and can't get more - abort - so sad */ err = ext4_resize_ensure_credits_batch(handle, 1); if (err < 0) break; if (meta_bg == 0) backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; else backup_block = first_block + has_super; bh = sb_getblk(sb, backup_block); if (unlikely(!bh)) { err = -ENOMEM; break; } ext4_debug("update metadata backup %llu(+%llu)\n", backup_block, backup_block - ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); if ((err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE))) { brelse(bh); break; } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) memset(bh->b_data + size, 0, rest); if (has_super && (backup_block == first_block)) ext4_set_block_group_nr(sb, bh->b_data, group); set_buffer_uptodate(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (unlikely(err)) ext4_std_error(sb, err); brelse(bh); if (meta_bg == 0) group = ext4_list_backups(sb, &three, &five, &seven); else if (group == last) break; else group = last; } if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; /* * Ugh! Need to have e2fsck write the backup copies. It is too * late to revert the resize, we shouldn't fail just because of * the backup copies (they are only needed in case of corruption). * * However, if we got here we have a journal problem too, so we * can't really start a transaction to mark the superblock. * Chicken out and just set the flag on the hope it will be written * to disk, and if not - we will simply wait until next fsck. */ exit_err: if (err) { ext4_warning(sb, "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); mark_buffer_dirty(sbi->s_sbh); } } /* * ext4_add_new_descs() adds @count group descriptor of groups * starting at @group * * @handle: journal handle * @sb: super block * @group: the group no. of the first group desc to be added * @resize_inode: the resize inode * @count: number of group descriptors to be added */ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, ext4_group_t group, struct inode *resize_inode, ext4_group_t count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *gdb_bh; int i, gdb_off, gdb_num, err = 0; int meta_bg; meta_bg = ext4_has_feature_meta_bg(sb); for (i = 0; i < count; i++, group++) { int reserved_gdb = ext4_bg_has_super(sb, group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; gdb_off = group % EXT4_DESC_PER_BLOCK(sb); gdb_num = group / EXT4_DESC_PER_BLOCK(sb); /* * We will only either add reserved group blocks to a backup group * or remove reserved blocks for the first group in a new group block. * Doing both would be mean more complex code, and sane people don't * use non-sparse filesystems anymore. This is already checked above. */ if (gdb_off) { gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, resize_inode, group); } else if (meta_bg != 0) { err = add_new_gdb_meta_bg(sb, handle, group); } else { err = add_new_gdb(handle, resize_inode, group); } if (err) break; } return err; } static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) { struct buffer_head *bh = sb_getblk(sb, block); if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { if (ext4_read_bh(bh, 0, NULL, false) < 0) { brelse(bh); return NULL; } } return bh; } static int ext4_set_bitmap_checksums(struct super_block *sb, struct ext4_group_desc *gdp, struct ext4_new_group_data *group_data) { struct buffer_head *bh; if (!ext4_has_feature_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; ext4_inode_bitmap_csum_set(sb, gdp, bh); brelse(bh); bh = ext4_get_bitmap(sb, group_data->block_bitmap); if (!bh) return -EIO; ext4_block_bitmap_csum_set(sb, gdp, bh); brelse(bh); return 0; } /* * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg */ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb, struct ext4_new_flex_group_data *flex_gd) { struct ext4_new_group_data *group_data = flex_gd->groups; struct ext4_group_desc *gdp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *gdb_bh; ext4_group_t group; __u16 *bg_flags = flex_gd->bg_flags; int i, gdb_off, gdb_num, err = 0; for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) { group = group_data->group; gdb_off = group % EXT4_DESC_PER_BLOCK(sb); gdb_num = group / EXT4_DESC_PER_BLOCK(sb); /* * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). */ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)(gdb_bh->b_data + gdb_off * EXT4_DESC_SIZE(sb)); memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap); ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap); err = ext4_set_bitmap_checksums(sb, gdp, group_data); if (err) { ext4_std_error(sb, err); break; } ext4_inode_table_set(sb, gdp, group_data->inode_table); ext4_free_group_clusters_set(sb, gdp, group_data->free_clusters_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); if (ext4_has_group_desc_csum(sb)) ext4_itable_unused_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); gdp->bg_flags = cpu_to_le16(*bg_flags); ext4_group_desc_csum_set(sb, group, gdp); err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); break; } /* * We can allocate memory for mb_alloc based on the new group * descriptor */ err = ext4_mb_add_groupinfo(sb, group, gdp); if (err) break; } return err; } static void ext4_add_overhead(struct super_block *sb, const ext4_fsblk_t overhead) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; sbi->s_overhead += overhead; es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); smp_wmb(); } /* * ext4_update_super() updates the super block so that the newly added * groups can be seen by the filesystem. * * @sb: super block * @flex_gd: new added groups */ static void ext4_update_super(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd) { ext4_fsblk_t blocks_count = 0; ext4_fsblk_t free_blocks = 0; ext4_fsblk_t reserved_blocks = 0; struct ext4_new_group_data *group_data = flex_gd->groups; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int i; BUG_ON(flex_gd->count == 0 || group_data == NULL); /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, * all of its blocks and inodes are already valid. * * We always allocate group-by-group, then block-by-block or * inode-by-inode within a group, so enabling these * blocks/inodes before the group is live won't actually let us * allocate the new space yet. */ for (i = 0; i < flex_gd->count; i++) { blocks_count += group_data[i].blocks_count; free_blocks += EXT4_C2B(sbi, group_data[i].free_clusters_count); } reserved_blocks = ext4_r_blocks_count(es) * 100; reserved_blocks = div64_u64(reserved_blocks, ext4_blocks_count(es)); reserved_blocks *= blocks_count; do_div(reserved_blocks, 100); lock_buffer(sbi->s_sbh); ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); ext4_debug("free blocks count %llu", ext4_free_blocks_count(es)); /* * We need to protect s_groups_count against other CPUs seeing * inconsistent state in the superblock. * * The precise rules we use are: * * * Writers must perform a smp_wmb() after updating all * dependent data and before modifying the groups count * * * Readers must perform an smp_rmb() after reading the groups * count and before reading any dependent data. * * NB. These rules can be relaxed when checking the group count * while freeing data, as we can only allocate from a block * group after serialising against the group count, and we can * only then free after serialising in turn against that * allocation. */ smp_wmb(); /* Update the global fs size fields */ sbi->s_groups_count += flex_gd->count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); /* Update the reserved block counts only once the new group is * active. */ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + reserved_blocks); /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, EXT4_NUM_B2C(sbi, free_blocks)); percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb) * flex_gd->count); ext4_debug("free blocks count %llu", percpu_counter_read(&sbi->s_freeclusters_counter)); if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) { ext4_group_t flex_group; struct flex_groups *fg; flex_group = ext4_flex_group(sbi, group_data[0].group); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), &fg->free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &fg->free_inodes); } /* * Update the fs overhead information. * * For bigalloc, if the superblock already has a properly calculated * overhead, update it with a value based on numbers already computed * above for the newly allocated capacity. */ if (ext4_has_feature_bigalloc(sb) && (sbi->s_overhead != 0)) ext4_add_overhead(sb, EXT4_NUM_B2C(sbi, blocks_count - free_blocks)); else ext4_calculate_overhead(sb); es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" "%llu blocks(%llu free %llu reserved)\n", flex_gd->count, blocks_count, free_blocks, reserved_blocks); } /* Add a flex group to an fs. Ensure we handle all possible error conditions * _before_ we start modifying the filesystem, because we cannot abort the * transaction and not have it write the data to disk. */ static int ext4_flex_group_add(struct super_block *sb, struct inode *resize_inode, struct ext4_new_flex_group_data *flex_gd) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t o_blocks_count; ext4_grpblk_t last; ext4_group_t group; handle_t *handle; unsigned reserved_gdb; int err = 0, err2 = 0, credit; BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); o_blocks_count = ext4_blocks_count(es); ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); BUG_ON(last); err = setup_new_flex_group_blocks(sb, flex_gd); if (err) goto exit; /* * We will always be modifying at least the superblock and GDT * blocks. If we are adding a group past the last current GDT block, * we will also modify the inode and the dindirect block. If we * are adding a group with superblock/GDT backups we will also * modify each of the reserved GDT dindirect blocks. */ credit = 3; /* sb, resize inode, resize inode dindirect */ /* GDT blocks */ credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); credit += reserved_gdb; /* Reserved GDT dindirect blocks */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto exit; } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto exit_journal; group = flex_gd->groups[0].group; BUG_ON(group != sbi->s_groups_count); err = ext4_add_new_descs(handle, sb, group, resize_inode, flex_gd->count); if (err) goto exit_journal; err = ext4_setup_new_descs(handle, sb, flex_gd); if (err) goto exit_journal; ext4_update_super(sb, flex_gd); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); exit_journal: err2 = ext4_journal_stop(handle); if (!err) err = err2; if (!err) { int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); int meta_bg = ext4_has_feature_meta_bg(sb) && gdb_num >= le32_to_cpu(es->s_first_meta_bg); sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr - ext4_group_first_block_no(sb, 0); update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); for (; gdb_num <= gdb_num_end; gdb_num++) { struct buffer_head *gdb_bh; gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); update_backups(sb, gdb_bh->b_blocknr - padding_blocks, gdb_bh->b_data, gdb_bh->b_size, meta_bg); } } exit: return err; } static int ext4_setup_next_flex_gd(struct super_block *sb, struct ext4_new_flex_group_data *flex_gd, ext4_fsblk_t n_blocks_count) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_new_group_data *group_data = flex_gd->groups; ext4_fsblk_t o_blocks_count; ext4_group_t n_group; ext4_group_t group; ext4_group_t last_group; ext4_grpblk_t last; ext4_grpblk_t clusters_per_group; unsigned long i; clusters_per_group = EXT4_CLUSTERS_PER_GROUP(sb); o_blocks_count = ext4_blocks_count(es); if (o_blocks_count == n_blocks_count) return 0; ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); BUG_ON(last); ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last); last_group = group | (flex_gd->resize_bg - 1); if (last_group > n_group) last_group = n_group; flex_gd->count = last_group - group + 1; for (i = 0; i < flex_gd->count; i++) { int overhead; group_data[i].group = group + i; group_data[i].blocks_count = EXT4_BLOCKS_PER_GROUP(sb); overhead = ext4_group_overhead_blocks(sb, group + i); group_data[i].mdata_blocks = overhead; group_data[i].free_clusters_count = EXT4_CLUSTERS_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) { flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT | EXT4_BG_INODE_UNINIT; if (!test_opt(sb, INIT_INODE_TABLE)) flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED; } else flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED; } if (last_group == n_group && ext4_has_group_desc_csum(sb)) /* We need to initialize block bitmap of last group. */ flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT; if ((last_group == n_group) && (last != clusters_per_group - 1)) { group_data[i - 1].blocks_count = EXT4_C2B(sbi, last + 1); group_data[i - 1].free_clusters_count -= clusters_per_group - last - 1; } return 1; } /* Add group descriptor data to an existing or new group descriptor block. * Ensure we handle all possible error conditions _before_ we start modifying * the filesystem, because we cannot abort the transaction and not have it * write the data to disk. * * If we are on a GDT block boundary, we need to get the reserved GDT block. * Otherwise, we may need to add backup GDT blocks for a sparse group. * * We only need to hold the superblock lock while we are actually adding * in the new group's counts to the superblock. Prior to that we have * not really "added" the group at all. We re-check that we are still * adding in the last group in case things have changed since verifying. */ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) { struct ext4_new_flex_group_data flex_gd; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int reserved_gdb = ext4_bg_has_super(sb, input->group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; struct inode *inode = NULL; int gdb_off; int err; __u16 bg_flags = 0; gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) { ext4_warning(sb, "Can't resize non-sparse filesystem further"); return -EPERM; } if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { ext4_warning(sb, "inodes_count overflow"); return -EINVAL; } if (reserved_gdb || gdb_off == 0) { if (!ext4_has_feature_resize_inode(sb) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); } } err = verify_group_input(sb, input); if (err) goto out; err = ext4_alloc_flex_bg_array(sb, input->group + 1); if (err) goto out; err = ext4_mb_alloc_groupinfo(sb, input->group + 1); if (err) goto out; flex_gd.count = 1; flex_gd.groups = input; flex_gd.bg_flags = &bg_flags; err = ext4_flex_group_add(sb, inode, &flex_gd); out: iput(inode); return err; } /* ext4_group_add */ /* * extend a group without checking assuming that checking has been done. */ static int ext4_group_extend_no_check(struct super_block *sb, ext4_fsblk_t o_blocks_count, ext4_grpblk_t add) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; handle_t *handle; int err = 0, err2; /* We will update the superblock, one block bitmap, and * one group descriptor via ext4_group_add_blocks(). */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_warning(sb, "error %d on journal start", err); return err; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) { ext4_warning(sb, "error %d on journal write access", err); goto errout; } lock_buffer(EXT4_SB(sb)->s_sbh); ext4_blocks_count_set(es, o_blocks_count + add); ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); if (err) goto errout; ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); errout: err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; if (!err) { if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu " "blocks\n", ext4_blocks_count(es)); update_backups(sb, ext4_group_first_block_no(sb, 0), (char *)es, sizeof(struct ext4_super_block), 0); } return err; } /* * Extend the filesystem to the new number of blocks specified. This entry * point is only used to extend the current filesystem to the end of the last * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" * for emergencies (because it has no dependencies on reserved blocks). * * If we _really_ wanted, we could use default values to call ext4_group_add() * allow the "remount" trick to work for arbitrary resizing, assuming enough * GDT blocks are reserved to grow to the desired size. */ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count) { ext4_fsblk_t o_blocks_count; ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; ext4_group_t group; o_blocks_count = ext4_blocks_count(es); if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, "extending last group from %llu to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) return 0; if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ext4_msg(sb, KERN_ERR, "filesystem too large to resize to %llu blocks safely", n_blocks_count); return -EINVAL; } if (n_blocks_count < o_blocks_count) { ext4_warning(sb, "can't shrink FS - resize aborted"); return -EINVAL; } /* Handle the remaining blocks in the last group only. */ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); if (last == 0) { ext4_warning(sb, "need to use ext2online to resize further"); return -EPERM; } add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { ext4_warning(sb, "blocks_count overflow"); return -EINVAL; } if (o_blocks_count + add > n_blocks_count) add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) ext4_warning(sb, "will only finish group (%llu blocks, %u new)", o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ bh = ext4_sb_bread(sb, o_blocks_count + add - 1, 0); if (IS_ERR(bh)) { ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); return ext4_group_extend_no_check(sb, o_blocks_count, add); } /* ext4_group_extend */ static int num_desc_blocks(struct super_block *sb, ext4_group_t groups) { return (groups + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); } /* * Release the resize inode and drop the resize_inode feature if there * are no more reserved gdt blocks, and then convert the file system * to enable meta_bg */ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) { handle_t *handle; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct ext4_inode_info *ei = EXT4_I(inode); ext4_fsblk_t nr; int i, ret, err = 0; int credits = 1; ext4_msg(sb, KERN_INFO, "Converting file system to meta_bg"); if (inode) { if (es->s_reserved_gdt_blocks) { ext4_error(sb, "Unexpected non-zero " "s_reserved_gdt_blocks"); return -EPERM; } /* Do a quick sanity check of the resize inode */ if (inode->i_blocks != 1 << (inode->i_blkbits - (9 - sbi->s_cluster_bits))) goto invalid_resize_inode; for (i = 0; i < EXT4_N_BLOCKS; i++) { if (i == EXT4_DIND_BLOCK) { if (ei->i_data[i]) continue; else goto invalid_resize_inode; } if (ei->i_data[i]) goto invalid_resize_inode; } credits += 3; /* block bitmap, bg descriptor, resize inode */ } handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credits); if (IS_ERR(handle)) return PTR_ERR(handle); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto errout; lock_buffer(sbi->s_sbh); ext4_clear_feature_resize_inode(sb); ext4_set_feature_meta_bg(sb); sbi->s_es->s_first_meta_bg = cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); if (err) { ext4_std_error(sb, err); goto errout; } if (inode) { nr = le32_to_cpu(ei->i_data[EXT4_DIND_BLOCK]); ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); ei->i_data[EXT4_DIND_BLOCK] = 0; inode->i_blocks = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) ext4_std_error(sb, err); } errout: ret = ext4_journal_stop(handle); return err ? err : ret; invalid_resize_inode: ext4_error(sb, "corrupted/inconsistent resize inode"); return -EINVAL; } /* * ext4_resize_fs() resizes a fs to new size specified by @n_blocks_count * * @sb: super block of the fs to be resized * @n_blocks_count: the number of blocks resides in the resized fs */ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) { struct ext4_new_flex_group_data *flex_gd = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *bh; struct inode *resize_inode = NULL; ext4_grpblk_t add, offset; unsigned long n_desc_blocks; unsigned long o_desc_blocks; ext4_group_t o_group; ext4_group_t n_group; ext4_fsblk_t o_blocks_count; ext4_fsblk_t n_blocks_count_retry = 0; unsigned long last_update_time = 0; int err = 0; int meta_bg; unsigned int flexbg_size = ext4_flex_bg_size(sbi); /* See if the device is actually as big as what was requested */ bh = ext4_sb_bread(sb, n_blocks_count - 1, 0); if (IS_ERR(bh)) { ext4_warning(sb, "can't read last block, resize aborted"); return -ENOSPC; } brelse(bh); /* * For bigalloc, trim the requested size to the nearest cluster * boundary to avoid creating an unusable filesystem. We do this * silently, instead of returning an error, to avoid breaking * callers that blindly resize the filesystem to the full size of * the underlying block device. */ if (ext4_has_feature_bigalloc(sb)) n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1); retry: o_blocks_count = ext4_blocks_count(es); ext4_msg(sb, KERN_INFO, "resizing filesystem from %llu " "to %llu blocks", o_blocks_count, n_blocks_count); if (n_blocks_count < o_blocks_count) { /* On-line shrinking not supported */ ext4_warning(sb, "can't shrink FS - resize aborted"); return -EINVAL; } if (n_blocks_count == o_blocks_count) /* Nothing need to do */ return 0; n_group = ext4_get_group_number(sb, n_blocks_count - 1); if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { ext4_warning(sb, "resize would cause inodes_count overflow"); return -EINVAL; } ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset); n_desc_blocks = num_desc_blocks(sb, n_group + 1); o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); meta_bg = ext4_has_feature_meta_bg(sb); if (ext4_has_feature_resize_inode(sb)) { if (meta_bg) { ext4_error(sb, "resize_inode and meta_bg enabled " "simultaneously"); return -EINVAL; } if (n_desc_blocks > o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks)) { n_blocks_count_retry = n_blocks_count; n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } if (!resize_inode) resize_inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(resize_inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); } } if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) { err = ext4_convert_meta_bg(sb, resize_inode); if (err) goto out; if (resize_inode) { iput(resize_inode); resize_inode = NULL; } if (n_blocks_count_retry) { n_blocks_count = n_blocks_count_retry; n_blocks_count_retry = 0; goto retry; } } /* * Make sure the last group has enough space so that it's * guaranteed to have enough space for all metadata blocks * that it might need to hold. (We might not need to store * the inode table blocks in the last block group, but there * will be cases where this might be needed.) */ if ((ext4_group_first_block_no(sb, n_group) + ext4_group_overhead_blocks(sb, n_group) + 2 + sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) { n_blocks_count = ext4_group_first_block_no(sb, n_group); n_group--; n_blocks_count_retry = 0; if (resize_inode) { iput(resize_inode); resize_inode = NULL; } goto retry; } /* extend the last group */ if (n_group == o_group) add = n_blocks_count - o_blocks_count; else add = EXT4_C2B(sbi, EXT4_CLUSTERS_PER_GROUP(sb) - (offset + 1)); if (add > 0) { err = ext4_group_extend_no_check(sb, o_blocks_count, add); if (err) goto out; } if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0) goto out; err = ext4_alloc_flex_bg_array(sb, n_group + 1); if (err) goto out; err = ext4_mb_alloc_groupinfo(sb, n_group + 1); if (err) goto out; flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group); if (flex_gd == NULL) { err = -ENOMEM; goto out; } /* Add flex groups. Note that a regular group is a * flex group with 1 group. */ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) { if (time_is_before_jiffies(last_update_time + HZ * 10)) { if (last_update_time) ext4_msg(sb, KERN_INFO, "resized to %llu blocks", ext4_blocks_count(es)); last_update_time = jiffies; } if (ext4_alloc_group_tables(sb, flex_gd, flexbg_size) != 0) break; err = ext4_flex_group_add(sb, resize_inode, flex_gd); if (unlikely(err)) break; } if (!err && n_blocks_count_retry) { n_blocks_count = n_blocks_count_retry; n_blocks_count_retry = 0; free_flex_gd(flex_gd); flex_gd = NULL; if (resize_inode) { iput(resize_inode); resize_inode = NULL; } goto retry; } out: if (flex_gd) free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); if (err) ext4_warning(sb, "error (%d) occurred during " "file system resize", err); ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", ext4_blocks_count(es)); return err; }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 // SPDX-License-Identifier: GPL-2.0 /* drivers/net/wireless/virt_wifi.c * * A fake implementation of cfg80211_ops that can be tacked on to an ethernet * net_device to make it appear as a wireless connection. * * Copyright (C) 2018 Google, Inc. * * Author: schuffelen@google.com */ #include <net/cfg80211.h> #include <net/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/math64.h> #include <linux/module.h> static struct wiphy *common_wiphy; struct virt_wifi_wiphy_priv { struct delayed_work scan_result; struct cfg80211_scan_request *scan_request; bool being_deleted; }; static struct ieee80211_channel channel_2ghz = { .band = NL80211_BAND_2GHZ, .center_freq = 2432, .hw_value = 2432, .max_power = 20, }; static struct ieee80211_rate bitrates_2ghz[] = { { .bitrate = 10 }, { .bitrate = 20 }, { .bitrate = 55 }, { .bitrate = 110 }, { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; static struct ieee80211_supported_band band_2ghz = { .channels = &channel_2ghz, .bitrates = bitrates_2ghz, .band = NL80211_BAND_2GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_2ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, }; static struct ieee80211_channel channel_5ghz = { .band = NL80211_BAND_5GHZ, .center_freq = 5240, .hw_value = 5240, .max_power = 20, }; static struct ieee80211_rate bitrates_5ghz[] = { { .bitrate = 60 }, { .bitrate = 120 }, { .bitrate = 240 }, }; #define RX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) #define TX_MCS_MAP (IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 | \ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14) static struct ieee80211_supported_band band_5ghz = { .channels = &channel_5ghz, .bitrates = bitrates_5ghz, .band = NL80211_BAND_5GHZ, .n_channels = 1, .n_bitrates = ARRAY_SIZE(bitrates_5ghz), .ht_cap = { .ht_supported = true, .cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | IEEE80211_HT_CAP_GRN_FLD | IEEE80211_HT_CAP_SGI_20 | IEEE80211_HT_CAP_SGI_40 | IEEE80211_HT_CAP_DSSSCCK40, .ampdu_factor = 0x3, .ampdu_density = 0x6, .mcs = { .rx_mask = {0xff, 0xff}, .tx_params = IEEE80211_HT_MCS_TX_DEFINED, }, }, .vht_cap = { .vht_supported = true, .cap = IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ | IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160 | IEEE80211_VHT_CAP_TXSTBC | IEEE80211_VHT_CAP_RXSTBC_1 | IEEE80211_VHT_CAP_RXSTBC_2 | IEEE80211_VHT_CAP_RXSTBC_3 | IEEE80211_VHT_CAP_RXSTBC_4 | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK, .vht_mcs = { .rx_mcs_map = cpu_to_le16(RX_MCS_MAP), .tx_mcs_map = cpu_to_le16(TX_MCS_MAP), } }, }; /* Assigned at module init. Guaranteed locally-administered and unicast. */ static u8 fake_router_bssid[ETH_ALEN] __ro_after_init = {}; #define VIRT_WIFI_SSID "VirtWifi" #define VIRT_WIFI_SSID_LEN 8 static void virt_wifi_inform_bss(struct wiphy *wiphy) { u64 tsf = div_u64(ktime_get_boottime_ns(), 1000); struct cfg80211_bss *informed_bss; static const struct { u8 tag; u8 len; u8 ssid[8] __nonstring; } __packed ssid = { .tag = WLAN_EID_SSID, .len = VIRT_WIFI_SSID_LEN, .ssid = VIRT_WIFI_SSID, }; informed_bss = cfg80211_inform_bss(wiphy, &channel_5ghz, CFG80211_BSS_FTYPE_PRESP, fake_router_bssid, tsf, WLAN_CAPABILITY_ESS, 0, (void *)&ssid, sizeof(ssid), DBM_TO_MBM(-50), GFP_KERNEL); cfg80211_put_bss(wiphy, informed_bss); } /* Called with the rtnl lock held. */ static int virt_wifi_scan(struct wiphy *wiphy, struct cfg80211_scan_request *request) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); wiphy_debug(wiphy, "scan\n"); if (priv->scan_request || priv->being_deleted) return -EBUSY; priv->scan_request = request; schedule_delayed_work(&priv->scan_result, HZ * 2); return 0; } /* Acquires and releases the rdev BSS lock. */ static void virt_wifi_scan_result(struct work_struct *work) { struct virt_wifi_wiphy_priv *priv = container_of(work, struct virt_wifi_wiphy_priv, scan_result.work); struct wiphy *wiphy = priv_to_wiphy(priv); struct cfg80211_scan_info scan_info = { .aborted = false }; virt_wifi_inform_bss(wiphy); /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } /* May acquire and release the rdev BSS lock. */ static void virt_wifi_cancel_scan(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv = wiphy_priv(wiphy); cancel_delayed_work_sync(&priv->scan_result); /* Clean up dangling callbacks if necessary. */ if (priv->scan_request) { struct cfg80211_scan_info scan_info = { .aborted = true }; /* Schedules work which acquires and releases the rtnl lock. */ cfg80211_scan_done(priv->scan_request, &scan_info); priv->scan_request = NULL; } } struct virt_wifi_netdev_priv { struct delayed_work connect; struct net_device *lowerdev; struct net_device *upperdev; u32 tx_packets; u32 tx_failed; u32 connect_requested_ssid_len; u8 connect_requested_ssid[IEEE80211_MAX_SSID_LEN]; u8 connect_requested_bss[ETH_ALEN]; bool is_up; bool is_connected; bool being_deleted; }; /* Called with the rtnl lock held. */ static int virt_wifi_connect(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_connect_params *sme) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); bool could_schedule; if (priv->being_deleted || !priv->is_up) return -EBUSY; if (!sme->ssid) return -EINVAL; priv->connect_requested_ssid_len = sme->ssid_len; memcpy(priv->connect_requested_ssid, sme->ssid, sme->ssid_len); could_schedule = schedule_delayed_work(&priv->connect, HZ * 2); if (!could_schedule) return -EBUSY; if (sme->bssid) { ether_addr_copy(priv->connect_requested_bss, sme->bssid); } else { virt_wifi_inform_bss(wiphy); eth_zero_addr(priv->connect_requested_bss); } wiphy_debug(wiphy, "connect\n"); return 0; } /* Acquires and releases the rdev event lock. */ static void virt_wifi_connect_complete(struct work_struct *work) { struct virt_wifi_netdev_priv *priv = container_of(work, struct virt_wifi_netdev_priv, connect.work); u8 *requested_bss = priv->connect_requested_bss; bool right_addr = ether_addr_equal(requested_bss, fake_router_bssid); bool right_ssid = priv->connect_requested_ssid_len == VIRT_WIFI_SSID_LEN && !memcmp(priv->connect_requested_ssid, VIRT_WIFI_SSID, priv->connect_requested_ssid_len); u16 status = WLAN_STATUS_SUCCESS; if (is_zero_ether_addr(requested_bss)) requested_bss = NULL; if (!priv->is_up || (requested_bss && !right_addr) || !right_ssid) status = WLAN_STATUS_UNSPECIFIED_FAILURE; else priv->is_connected = true; /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, requested_bss, NULL, 0, NULL, 0, status, GFP_KERNEL); netif_carrier_on(priv->upperdev); } /* May acquire and release the rdev event lock. */ static void virt_wifi_cancel_connect(struct net_device *netdev) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); /* If there is work pending, clean up dangling callbacks. */ if (cancel_delayed_work_sync(&priv->connect)) { /* Schedules an event that acquires the rtnl lock. */ cfg80211_connect_result(priv->upperdev, priv->connect_requested_bss, NULL, 0, NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE, GFP_KERNEL); } } /* Called with the rtnl lock held. Acquires the rdev event lock. */ static int virt_wifi_disconnect(struct wiphy *wiphy, struct net_device *netdev, u16 reason_code) { struct virt_wifi_netdev_priv *priv = netdev_priv(netdev); if (priv->being_deleted) return -EBUSY; wiphy_debug(wiphy, "disconnect\n"); virt_wifi_cancel_connect(netdev); cfg80211_disconnected(netdev, reason_code, NULL, 0, true, GFP_KERNEL); priv->is_connected = false; netif_carrier_off(netdev); return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_get_station(struct wiphy *wiphy, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "get_station\n"); if (!priv->is_connected || !ether_addr_equal(mac, fake_router_bssid)) return -ENOENT; sinfo->filled = BIT_ULL(NL80211_STA_INFO_TX_PACKETS) | BIT_ULL(NL80211_STA_INFO_TX_FAILED) | BIT_ULL(NL80211_STA_INFO_SIGNAL) | BIT_ULL(NL80211_STA_INFO_TX_BITRATE); sinfo->tx_packets = priv->tx_packets; sinfo->tx_failed = priv->tx_failed; /* For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ */ sinfo->signal = -50; sinfo->txrate = (struct rate_info) { .legacy = 10, /* units are 100kbit/s */ }; return 0; } /* Called with the rtnl lock held. */ static int virt_wifi_dump_station(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); wiphy_debug(wiphy, "dump_station\n"); if (idx != 0 || !priv->is_connected) return -ENOENT; ether_addr_copy(mac, fake_router_bssid); return virt_wifi_get_station(wiphy, dev, fake_router_bssid, sinfo); } static const struct cfg80211_ops virt_wifi_cfg80211_ops = { .scan = virt_wifi_scan, .connect = virt_wifi_connect, .disconnect = virt_wifi_disconnect, .get_station = virt_wifi_get_station, .dump_station = virt_wifi_dump_station, }; /* Acquires and releases the rtnl lock. */ static struct wiphy *virt_wifi_make_wiphy(void) { struct wiphy *wiphy; struct virt_wifi_wiphy_priv *priv; int err; wiphy = wiphy_new(&virt_wifi_cfg80211_ops, sizeof(*priv)); if (!wiphy) return NULL; wiphy->max_scan_ssids = 4; wiphy->max_scan_ie_len = 1000; wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; wiphy->bands[NL80211_BAND_2GHZ] = &band_2ghz; wiphy->bands[NL80211_BAND_5GHZ] = &band_5ghz; wiphy->bands[NL80211_BAND_60GHZ] = NULL; wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION); priv = wiphy_priv(wiphy); priv->being_deleted = false; priv->scan_request = NULL; INIT_DELAYED_WORK(&priv->scan_result, virt_wifi_scan_result); err = wiphy_register(wiphy); if (err < 0) { wiphy_free(wiphy); return NULL; } return wiphy; } /* Acquires and releases the rtnl lock. */ static void virt_wifi_destroy_wiphy(struct wiphy *wiphy) { struct virt_wifi_wiphy_priv *priv; WARN(!wiphy, "%s called with null wiphy", __func__); if (!wiphy) return; priv = wiphy_priv(wiphy); priv->being_deleted = true; virt_wifi_cancel_scan(wiphy); if (wiphy->registered) wiphy_unregister(wiphy); wiphy_free(wiphy); } /* Enters and exits a RCU-bh critical section. */ static netdev_tx_t virt_wifi_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->tx_packets++; if (!priv->is_connected) { priv->tx_failed++; return NET_XMIT_DROP; } skb->dev = priv->lowerdev; return dev_queue_xmit(skb); } /* Called with rtnl lock held. */ static int virt_wifi_net_device_open(struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); priv->is_up = true; return 0; } /* Called with rtnl lock held. */ static int virt_wifi_net_device_stop(struct net_device *dev) { struct virt_wifi_netdev_priv *n_priv = netdev_priv(dev); n_priv->is_up = false; if (!dev->ieee80211_ptr) return 0; virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); virt_wifi_cancel_connect(dev); netif_carrier_off(dev); return 0; } static int virt_wifi_net_device_get_iflink(const struct net_device *dev) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); return READ_ONCE(priv->lowerdev->ifindex); } static const struct net_device_ops virt_wifi_ops = { .ndo_start_xmit = virt_wifi_start_xmit, .ndo_open = virt_wifi_net_device_open, .ndo_stop = virt_wifi_net_device_stop, .ndo_get_iflink = virt_wifi_net_device_get_iflink, }; /* Invoked as part of rtnl lock release. */ static void virt_wifi_net_device_destructor(struct net_device *dev) { /* Delayed past dellink to allow nl80211 to react to the device being * deleted. */ kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; } /* No lock interaction. */ static void virt_wifi_setup(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &virt_wifi_ops; dev->needs_free_netdev = true; } /* Called in a RCU read critical section from netif_receive_skb */ static rx_handler_result_t virt_wifi_rx_handler(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct virt_wifi_netdev_priv *priv = rcu_dereference(skb->dev->rx_handler_data); if (!priv->is_connected) return RX_HANDLER_PASS; /* GFP_ATOMIC because this is a packet interrupt handler. */ skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { dev_err(&priv->upperdev->dev, "can't skb_share_check\n"); return RX_HANDLER_CONSUMED; } *pskb = skb; skb->dev = priv->upperdev; skb->pkt_type = PACKET_HOST; return RX_HANDLER_ANOTHER; } /* Called with rtnl lock held. */ static int virt_wifi_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); struct net *link_net = rtnl_newlink_link_net(params); struct nlattr **tb = params->tb; int err; if (!tb[IFLA_LINK]) return -EINVAL; netif_carrier_off(dev); priv->upperdev = dev; priv->lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!priv->lowerdev) return -ENODEV; if (!tb[IFLA_MTU]) dev->mtu = priv->lowerdev->mtu; else if (dev->mtu > priv->lowerdev->mtu) return -EINVAL; err = netdev_rx_handler_register(priv->lowerdev, virt_wifi_rx_handler, priv); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_rx_handler_register: %d\n", err); return err; } eth_hw_addr_inherit(dev, priv->lowerdev); netif_stacked_transfer_operstate(priv->lowerdev, dev); SET_NETDEV_DEV(dev, &priv->lowerdev->dev); dev->ieee80211_ptr = kzalloc(sizeof(*dev->ieee80211_ptr), GFP_KERNEL); if (!dev->ieee80211_ptr) { err = -ENOMEM; goto remove_handler; } dev->ieee80211_ptr->iftype = NL80211_IFTYPE_STATION; dev->ieee80211_ptr->wiphy = common_wiphy; err = register_netdevice(dev); if (err) { dev_err(&priv->lowerdev->dev, "can't register_netdevice: %d\n", err); goto free_wireless_dev; } err = netdev_upper_dev_link(priv->lowerdev, dev, extack); if (err) { dev_err(&priv->lowerdev->dev, "can't netdev_upper_dev_link: %d\n", err); goto unregister_netdev; } dev->priv_destructor = virt_wifi_net_device_destructor; priv->being_deleted = false; priv->is_connected = false; priv->is_up = false; INIT_DELAYED_WORK(&priv->connect, virt_wifi_connect_complete); __module_get(THIS_MODULE); return 0; unregister_netdev: unregister_netdevice(dev); free_wireless_dev: kfree(dev->ieee80211_ptr); dev->ieee80211_ptr = NULL; remove_handler: netdev_rx_handler_unregister(priv->lowerdev); return err; } /* Called with rtnl lock held. */ static void virt_wifi_dellink(struct net_device *dev, struct list_head *head) { struct virt_wifi_netdev_priv *priv = netdev_priv(dev); if (dev->ieee80211_ptr) virt_wifi_cancel_scan(dev->ieee80211_ptr->wiphy); priv->being_deleted = true; virt_wifi_cancel_connect(dev); netif_carrier_off(dev); netdev_rx_handler_unregister(priv->lowerdev); netdev_upper_dev_unlink(priv->lowerdev, dev); unregister_netdevice_queue(dev, head); module_put(THIS_MODULE); /* Deleting the wiphy is handled in the module destructor. */ } static struct rtnl_link_ops virt_wifi_link_ops = { .kind = "virt_wifi", .setup = virt_wifi_setup, .newlink = virt_wifi_newlink, .dellink = virt_wifi_dellink, .priv_size = sizeof(struct virt_wifi_netdev_priv), }; static bool netif_is_virt_wifi_dev(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == virt_wifi_rx_handler; } static int virt_wifi_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *lower_dev = netdev_notifier_info_to_dev(ptr); struct virt_wifi_netdev_priv *priv; struct net_device *upper_dev; LIST_HEAD(list_kill); if (!netif_is_virt_wifi_dev(lower_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_UNREGISTER: priv = rtnl_dereference(lower_dev->rx_handler_data); if (!priv) return NOTIFY_DONE; upper_dev = priv->upperdev; upper_dev->rtnl_link_ops->dellink(upper_dev, &list_kill); unregister_netdevice_many(&list_kill); break; } return NOTIFY_DONE; } static struct notifier_block virt_wifi_notifier = { .notifier_call = virt_wifi_event, }; /* Acquires and releases the rtnl lock. */ static int __init virt_wifi_init_module(void) { int err; /* Guaranteed to be locally-administered and not multicast. */ eth_random_addr(fake_router_bssid); err = register_netdevice_notifier(&virt_wifi_notifier); if (err) return err; err = -ENOMEM; common_wiphy = virt_wifi_make_wiphy(); if (!common_wiphy) goto notifier; err = rtnl_link_register(&virt_wifi_link_ops); if (err) goto destroy_wiphy; return 0; destroy_wiphy: virt_wifi_destroy_wiphy(common_wiphy); notifier: unregister_netdevice_notifier(&virt_wifi_notifier); return err; } /* Acquires and releases the rtnl lock. */ static void __exit virt_wifi_cleanup_module(void) { /* Will delete any devices that depend on the wiphy. */ rtnl_link_unregister(&virt_wifi_link_ops); virt_wifi_destroy_wiphy(common_wiphy); unregister_netdevice_notifier(&virt_wifi_notifier); } module_init(virt_wifi_init_module); module_exit(virt_wifi_cleanup_module); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Cody Schuffelen <schuffelen@google.com>"); MODULE_DESCRIPTION("Driver for a wireless wrapper of ethernet devices"); MODULE_ALIAS_RTNL_LINK("virt_wifi");
6 6 6 221 221 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 // SPDX-License-Identifier: GPL-2.0 /* * f2fs sysfs interface * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ * Copyright (c) 2017 Chao Yu <chao@kernel.org> */ #include <linux/compiler.h> #include <linux/proc_fs.h> #include <linux/f2fs_fs.h> #include <linux/seq_file.h> #include <linux/unicode.h> #include <linux/ioprio.h> #include <linux/sysfs.h> #include "f2fs.h" #include "segment.h" #include "gc.h" #include "iostat.h" #include <trace/events/f2fs.h> static struct proc_dir_entry *f2fs_proc_root; /* Sysfs support for f2fs */ enum { GC_THREAD, /* struct f2fs_gc_thread */ SM_INFO, /* struct f2fs_sm_info */ DCC_INFO, /* struct discard_cmd_control */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ #ifdef CONFIG_F2FS_STAT_FS STAT_INFO, /* struct f2fs_stat_info */ #endif #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif RESERVED_BLOCKS, /* struct f2fs_sb_info */ CPRC_INFO, /* struct ckpt_req_control */ ATGC_INFO, /* struct atgc_management */ }; static const char *gc_mode_names[MAX_GC_MODE] = { "GC_NORMAL", "GC_IDLE_CB", "GC_IDLE_GREEDY", "GC_IDLE_AT", "GC_URGENT_HIGH", "GC_URGENT_LOW", "GC_URGENT_MID" }; struct f2fs_attr { struct attribute attr; ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t len); int struct_type; int offset; int id; }; struct f2fs_base_attr { struct attribute attr; ssize_t (*show)(struct f2fs_base_attr *a, char *buf); ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); }; static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) { if (struct_type == GC_THREAD) return (unsigned char *)sbi->gc_thread; else if (struct_type == SM_INFO) return (unsigned char *)SM_I(sbi); else if (struct_type == DCC_INFO) return (unsigned char *)SM_I(sbi)->dcc_info; else if (struct_type == NM_INFO) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) return (unsigned char *)sbi; #ifdef CONFIG_F2FS_FAULT_INJECTION else if (struct_type == FAULT_INFO_RATE || struct_type == FAULT_INFO_TYPE) return (unsigned char *)&F2FS_OPTION(sbi).fault_info; #endif #ifdef CONFIG_F2FS_STAT_FS else if (struct_type == STAT_INFO) return (unsigned char *)F2FS_STAT(sbi); #endif else if (struct_type == CPRC_INFO) return (unsigned char *)&sbi->cprc_info; else if (struct_type == ATGC_INFO) return (unsigned char *)&sbi->am; return NULL; } static ssize_t dirty_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)(dirty_segments(sbi))); } static ssize_t free_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)(free_segments(sbi))); } static ssize_t ovp_segments_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)(overprovision_segments(sbi))); } static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)(sbi->kbytes_written + ((f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1))); } static ssize_t sb_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%lx\n", sbi->s_flag); } static ssize_t cp_status_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags)); } static ssize_t pending_discard_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (!SM_I(sbi)->dcc_info) return -EINVAL; return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->discard_cmd_cnt)); } static ssize_t issued_discard_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (!SM_I(sbi)->dcc_info) return -EINVAL; return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->issued_discard)); } static ssize_t queued_discard_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (!SM_I(sbi)->dcc_info) return -EINVAL; return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read( &SM_I(sbi)->dcc_info->queued_discard)); } static ssize_t undiscard_blks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (!SM_I(sbi)->dcc_info) return -EINVAL; return sysfs_emit(buf, "%u\n", SM_I(sbi)->dcc_info->undiscard_blks); } static ssize_t atgc_enabled_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%d\n", sbi->am.atgc_enabled ? 1 : 0); } static ssize_t gc_mode_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]); } static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { int len = 0; if (f2fs_sb_has_encrypt(sbi)) len += sysfs_emit_at(buf, len, "%s", "encryption"); if (f2fs_sb_has_blkzoned(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "extra_attr"); if (f2fs_sb_has_project_quota(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "projquota"); if (f2fs_sb_has_inode_chksum(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "inode_checksum"); if (f2fs_sb_has_flexible_inline_xattr(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); if (f2fs_sb_has_quota_ino(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "quota_ino"); if (f2fs_sb_has_inode_crtime(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "inode_crtime"); if (f2fs_sb_has_lost_found(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "lost_found"); if (f2fs_sb_has_verity(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "verity"); if (f2fs_sb_has_sb_chksum(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "sb_checksum"); if (f2fs_sb_has_casefold(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "casefold"); if (f2fs_sb_has_readonly(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "readonly"); if (f2fs_sb_has_compression(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "compression"); len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "pin_file"); len += sysfs_emit_at(buf, len, "\n"); return len; } static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks); } static ssize_t unusable_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { block_t unusable; if (test_opt(sbi, DISABLE_CHECKPOINT)) unusable = sbi->unusable_block_count; else unusable = f2fs_get_unusable_blocks(sbi); return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable); } static ssize_t encoding_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { #if IS_ENABLED(CONFIG_UNICODE) struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n", (sb->s_encoding->version >> 16) & 0xff, (sb->s_encoding->version >> 8) & 0xff, sb->s_encoding->version & 0xff); #endif return sysfs_emit(buf, "(none)\n"); } static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time); } #ifdef CONFIG_F2FS_STAT_FS static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { struct f2fs_stat_info *si = F2FS_STAT(sbi); return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->tot_blks - (si->bg_data_blks + si->bg_node_blks))); } static ssize_t moved_blocks_background_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { struct f2fs_stat_info *si = F2FS_STAT(sbi); return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->bg_data_blks + si->bg_node_blks)); } static ssize_t avg_vblocks_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { struct f2fs_stat_info *si = F2FS_STAT(sbi); si->dirty_count = dirty_segments(sbi); f2fs_update_sit_info(sbi); return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks)); } #endif static ssize_t main_blkaddr_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)MAIN_BLKADDR(sbi)); } static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { unsigned char *ptr = NULL; unsigned int *ui; ptr = __struct_ptr(sbi, a->struct_type); if (!ptr) return -EINVAL; if (!strcmp(a->attr.name, "extension_list")) { __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list; int cold_count = le32_to_cpu(sbi->raw_super->extension_count); int hot_count = sbi->raw_super->hot_ext_count; int len = 0, i; len += sysfs_emit_at(buf, len, "cold file extension:\n"); for (i = 0; i < cold_count; i++) len += sysfs_emit_at(buf, len, "%s\n", extlist[i]); len += sysfs_emit_at(buf, len, "hot file extension:\n"); for (i = cold_count; i < cold_count + hot_count; i++) len += sysfs_emit_at(buf, len, "%s\n", extlist[i]); return len; } if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { struct ckpt_req_control *cprc = &sbi->cprc_info; int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio); int level = IOPRIO_PRIO_LEVEL(cprc->ckpt_thread_ioprio); if (class != IOPRIO_CLASS_RT && class != IOPRIO_CLASS_BE) return -EINVAL; return sysfs_emit(buf, "%s,%d\n", class == IOPRIO_CLASS_RT ? "rt" : "be", level); } #ifdef CONFIG_F2FS_FS_COMPRESSION if (!strcmp(a->attr.name, "compr_written_block")) return sysfs_emit(buf, "%llu\n", sbi->compr_written_block); if (!strcmp(a->attr.name, "compr_saved_block")) return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block); if (!strcmp(a->attr.name, "compr_new_inode")) return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif if (!strcmp(a->attr.name, "gc_segment_mode")) return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { return sysfs_emit(buf, "%u\n", sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); } if (!strcmp(a->attr.name, "current_atomic_write")) { s64 current_write = atomic64_read(&sbi->current_atomic_write); return sysfs_emit(buf, "%lld\n", current_write); } if (!strcmp(a->attr.name, "peak_atomic_write")) return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write); if (!strcmp(a->attr.name, "committed_atomic_block")) return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block); if (!strcmp(a->attr.name, "revoked_atomic_block")) return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block); #ifdef CONFIG_F2FS_STAT_FS if (!strcmp(a->attr.name, "cp_foreground_calls")) return sysfs_emit(buf, "%d\n", atomic_read(&sbi->cp_call_count[TOTAL_CALL]) - atomic_read(&sbi->cp_call_count[BACKGROUND])); if (!strcmp(a->attr.name, "cp_background_calls")) return sysfs_emit(buf, "%d\n", atomic_read(&sbi->cp_call_count[BACKGROUND])); #endif ui = (unsigned int *)(ptr + a->offset); return sysfs_emit(buf, "%u\n", *ui); } static ssize_t __sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { unsigned char *ptr; unsigned long t; unsigned int *ui; ssize_t ret; ptr = __struct_ptr(sbi, a->struct_type); if (!ptr) return -EINVAL; if (!strcmp(a->attr.name, "extension_list")) { const char *name = strim((char *)buf); bool set = true, hot; if (!strncmp(name, "[h]", 3)) hot = true; else if (!strncmp(name, "[c]", 3)) hot = false; else return -EINVAL; name += 3; if (*name == '!') { name++; set = false; } if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; f2fs_down_write(&sbi->sb_lock); ret = f2fs_update_extension_list(sbi, name, hot, set); if (ret) goto out; ret = f2fs_commit_super(sbi, false); if (ret) f2fs_update_extension_list(sbi, name, hot, !set); out: f2fs_up_write(&sbi->sb_lock); return ret ? ret : count; } if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) { const char *name = strim((char *)buf); struct ckpt_req_control *cprc = &sbi->cprc_info; int class; long level; int ret; if (!strncmp(name, "rt,", 3)) class = IOPRIO_CLASS_RT; else if (!strncmp(name, "be,", 3)) class = IOPRIO_CLASS_BE; else return -EINVAL; name += 3; ret = kstrtol(name, 10, &level); if (ret) return ret; if (level >= IOPRIO_NR_LEVELS || level < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, level); if (test_opt(sbi, MERGE_CHECKPOINT)) { ret = set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); if (ret) return ret; } return count; } ui = (unsigned int *)(ptr + a->offset); ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE) { if (f2fs_build_fault_attr(sbi, 0, t)) return -EINVAL; return count; } if (a->struct_type == FAULT_INFO_RATE) { if (f2fs_build_fault_attr(sbi, t, 0)) return -EINVAL; return count; } #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } *ui = t; sbi->current_reserved_blocks = min(sbi->reserved_blocks, sbi->user_block_count - valid_user_blocks(sbi)); spin_unlock(&sbi->stat_lock); return count; } if (!strcmp(a->attr.name, "discard_io_aware_gran")) { if (t > MAX_PLIST_NUM) return -EINVAL; if (!f2fs_block_unit_discard(sbi)) return -EINVAL; if (t == *ui) return count; *ui = t; return count; } if (!strcmp(a->attr.name, "discard_granularity")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; if (!f2fs_block_unit_discard(sbi)) return -EINVAL; if (t == *ui) return count; *ui = t; return count; } if (!strcmp(a->attr.name, "max_ordered_discard")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; if (!f2fs_block_unit_discard(sbi)) return -EINVAL; *ui = t; return count; } if (!strcmp(a->attr.name, "discard_urgent_util")) { if (t > 100) return -EINVAL; *ui = t; return count; } if (!strcmp(a->attr.name, "discard_io_aware")) { if (t >= DPOLICY_IO_AWARE_MAX) return -EINVAL; *ui = t; return count; } if (!strcmp(a->attr.name, "migration_granularity")) { if (t == 0 || t > SEGS_PER_SEC(sbi)) return -EINVAL; } if (!strcmp(a->attr.name, "migration_window_granularity")) { if (t == 0 || t > SEGS_PER_SEC(sbi)) return -EINVAL; } if (!strcmp(a->attr.name, "gc_urgent")) { if (t == 0) { sbi->gc_mode = GC_NORMAL; } else if (t == 1) { sbi->gc_mode = GC_URGENT_HIGH; if (sbi->gc_thread) { sbi->gc_thread->gc_wake = true; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); } } else if (t == 2) { sbi->gc_mode = GC_URGENT_LOW; } else if (t == 3) { sbi->gc_mode = GC_URGENT_MID; if (sbi->gc_thread) { sbi->gc_thread->gc_wake = true; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); } } else { return -EINVAL; } return count; } if (!strcmp(a->attr.name, "gc_idle")) { if (t == GC_IDLE_CB) { sbi->gc_mode = GC_IDLE_CB; } else if (t == GC_IDLE_GREEDY) { sbi->gc_mode = GC_IDLE_GREEDY; } else if (t == GC_IDLE_AT) { if (!sbi->am.atgc_enabled) return -EINVAL; sbi->gc_mode = GC_IDLE_AT; } else { sbi->gc_mode = GC_NORMAL; } return count; } if (!strcmp(a->attr.name, "gc_remaining_trials")) { spin_lock(&sbi->gc_remaining_trials_lock); sbi->gc_remaining_trials = t; spin_unlock(&sbi->gc_remaining_trials_lock); return count; } #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; if (!sbi->iostat_enable) f2fs_reset_iostat(sbi); return count; } if (!strcmp(a->attr.name, "iostat_period_ms")) { if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS) return -EINVAL; spin_lock_irq(&sbi->iostat_lock); sbi->iostat_period_ms = (unsigned int)t; spin_unlock_irq(&sbi->iostat_lock); return count; } #endif #ifdef CONFIG_BLK_DEV_ZONED if (!strcmp(a->attr.name, "blkzone_alloc_policy")) { if (t < BLKZONE_ALLOC_PRIOR_SEQ || t > BLKZONE_ALLOC_PRIOR_CONV) return -EINVAL; sbi->blkzone_alloc_policy = t; return count; } #endif #ifdef CONFIG_F2FS_FS_COMPRESSION if (!strcmp(a->attr.name, "compr_written_block") || !strcmp(a->attr.name, "compr_saved_block")) { if (t != 0) return -EINVAL; sbi->compr_written_block = 0; sbi->compr_saved_block = 0; return count; } if (!strcmp(a->attr.name, "compr_new_inode")) { if (t != 0) return -EINVAL; sbi->compr_new_inode = 0; return count; } if (!strcmp(a->attr.name, "compress_percent")) { if (t == 0 || t > 100) return -EINVAL; *ui = t; return count; } if (!strcmp(a->attr.name, "compress_watermark")) { if (t == 0 || t > 100) return -EINVAL; *ui = t; return count; } #endif if (!strcmp(a->attr.name, "atgc_candidate_ratio")) { if (t > 100) return -EINVAL; sbi->am.candidate_ratio = t; return count; } if (!strcmp(a->attr.name, "atgc_age_weight")) { if (t > 100) return -EINVAL; sbi->am.age_weight = t; return count; } if (!strcmp(a->attr.name, "gc_segment_mode")) { if (t < MAX_GC_MODE) sbi->gc_segment_mode = t; else return -EINVAL; return count; } if (!strcmp(a->attr.name, "gc_pin_file_threshold")) { if (t > MAX_GC_FAILED_PINNED_FILES) return -EINVAL; sbi->gc_pin_file_threshold = t; return count; } if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { if (t != 0) return -EINVAL; sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0; return count; } if (!strcmp(a->attr.name, "seq_file_ra_mul")) { if (t >= MIN_RA_MUL && t <= MAX_RA_MUL) sbi->seq_file_ra_mul = t; else return -EINVAL; return count; } if (!strcmp(a->attr.name, "max_fragment_chunk")) { if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) sbi->max_fragment_chunk = t; else return -EINVAL; return count; } if (!strcmp(a->attr.name, "max_fragment_hole")) { if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) sbi->max_fragment_hole = t; else return -EINVAL; return count; } if (!strcmp(a->attr.name, "peak_atomic_write")) { if (t != 0) return -EINVAL; sbi->peak_atomic_write = 0; return count; } if (!strcmp(a->attr.name, "committed_atomic_block")) { if (t != 0) return -EINVAL; sbi->committed_atomic_block = 0; return count; } if (!strcmp(a->attr.name, "revoked_atomic_block")) { if (t != 0) return -EINVAL; sbi->revoked_atomic_block = 0; return count; } if (!strcmp(a->attr.name, "readdir_ra")) { sbi->readdir_ra = !!t; return count; } if (!strcmp(a->attr.name, "hot_data_age_threshold")) { if (t == 0 || t >= sbi->warm_data_age_threshold) return -EINVAL; if (t == *ui) return count; *ui = (unsigned int)t; return count; } if (!strcmp(a->attr.name, "warm_data_age_threshold")) { if (t <= sbi->hot_data_age_threshold) return -EINVAL; if (t == *ui) return count; *ui = (unsigned int)t; return count; } if (!strcmp(a->attr.name, "last_age_weight")) { if (t > 100) return -EINVAL; if (t == *ui) return count; *ui = (unsigned int)t; return count; } if (!strcmp(a->attr.name, "max_read_extent_count")) { if (t > UINT_MAX) return -EINVAL; *ui = (unsigned int)t; return count; } if (!strcmp(a->attr.name, "ipu_policy")) { if (t >= BIT(F2FS_IPU_MAX)) return -EINVAL; /* allow F2FS_IPU_NOCACHE only for IPU in the pinned file */ if (f2fs_lfs_mode(sbi) && (t & ~BIT(F2FS_IPU_NOCACHE))) return -EINVAL; SM_I(sbi)->ipu_policy = (unsigned int)t; return count; } if (!strcmp(a->attr.name, "dir_level")) { if (t > MAX_DIR_HASH_DEPTH) return -EINVAL; sbi->dir_level = t; return count; } *ui = (unsigned int)t; return count; } static ssize_t f2fs_sbi_store(struct f2fs_attr *a, struct f2fs_sb_info *sbi, const char *buf, size_t count) { ssize_t ret; bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || a->struct_type == GC_THREAD); if (gc_entry) { if (!down_read_trylock(&sbi->sb->s_umount)) return -EAGAIN; } ret = __sbi_store(a, sbi, buf, count); if (gc_entry) up_read(&sbi->sb->s_umount); return ret; } static ssize_t f2fs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_kobj); struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); return a->show ? a->show(a, sbi, buf) : 0; } static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_kobj); struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); return a->store ? a->store(a, sbi, buf, len) : 0; } static void f2fs_sb_release(struct kobject *kobj) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_kobj); complete(&sbi->s_kobj_unregister); } static ssize_t f2fs_base_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct f2fs_base_attr *a = container_of(attr, struct f2fs_base_attr, attr); return a->show ? a->show(a, buf) : 0; } static ssize_t f2fs_base_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct f2fs_base_attr *a = container_of(attr, struct f2fs_base_attr, attr); return a->store ? a->store(a, buf, len) : 0; } /* * Note that there are three feature list entries: * 1) /sys/fs/f2fs/features * : shows runtime features supported by in-kernel f2fs along with Kconfig. * - ref. F2FS_FEATURE_RO_ATTR() * * 2) /sys/fs/f2fs/$s_id/features <deprecated> * : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This * won't add new feature anymore, and thus, users should check entries in 3) * instead of this 2). * * 3) /sys/fs/f2fs/$s_id/feature_list * : shows on-disk features enabled by mkfs.f2fs per instance, which follows * sysfs entry rule where each entry should expose single value. * This list covers old feature list provided by 2) and beyond. Therefore, * please add new on-disk feature in this list only. * - ref. F2FS_SB_FEATURE_RO_ATTR() */ static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) { return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_feature_show, \ } static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf) { unsigned int res = 0; if (!strcmp(a->attr.name, "reclaim_caches_kb")) res = f2fs_donate_files(); return sysfs_emit(buf, "%u\n", res); } static ssize_t f2fs_tune_store(struct f2fs_base_attr *a, const char *buf, size_t count) { unsigned long t; int ret; ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret) return ret; if (!strcmp(a->attr.name, "reclaim_caches_kb")) f2fs_reclaim_caches(t); return count; } #define F2FS_TUNE_RW_ATTR(_name) \ static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0644 }, \ .show = f2fs_tune_show, \ .store = f2fs_tune_store, \ } static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { if (F2FS_HAS_FEATURE(sbi, a->id)) return sysfs_emit(buf, "supported\n"); return sysfs_emit(buf, "unsupported\n"); } #define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \ static struct f2fs_attr f2fs_attr_sb_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_sb_feature_show, \ .id = F2FS_FEATURE_##_feat, \ } #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ .struct_type = _struct_type, \ .offset = _offset \ } #define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0444, \ f2fs_sbi_show, NULL, \ offsetof(struct struct_name, elname)) #define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0644, \ f2fs_sbi_show, f2fs_sbi_store, \ offsetof(struct struct_name, elname)) #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) #ifdef CONFIG_F2FS_STAT_FS #define STAT_INFO_RO_ATTR(name, elname) \ F2FS_RO_ATTR(STAT_INFO, f2fs_stat_info, name, elname) #endif #define GC_THREAD_RW_ATTR(name, elname) \ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, name, elname) #define SM_INFO_RW_ATTR(name, elname) \ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, name, elname) #define SM_INFO_GENERAL_RW_ATTR(elname) \ SM_INFO_RW_ATTR(elname, elname) #define DCC_INFO_RW_ATTR(name, elname) \ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, name, elname) #define DCC_INFO_GENERAL_RW_ATTR(elname) \ DCC_INFO_RW_ATTR(elname, elname) #define NM_INFO_RW_ATTR(name, elname) \ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, name, elname) #define NM_INFO_GENERAL_RW_ATTR(elname) \ NM_INFO_RW_ATTR(elname, elname) #define F2FS_SBI_RW_ATTR(name, elname) \ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, name, elname) #define F2FS_SBI_GENERAL_RW_ATTR(elname) \ F2FS_SBI_RW_ATTR(elname, elname) #define F2FS_SBI_GENERAL_RO_ATTR(elname) \ F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, elname, elname) #ifdef CONFIG_F2FS_FAULT_INJECTION #define FAULT_INFO_GENERAL_RW_ATTR(type, elname) \ F2FS_RW_ATTR(type, f2fs_fault_info, elname, elname) #endif #define RESERVED_BLOCKS_GENERAL_RW_ATTR(elname) \ F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, elname, elname) #define CPRC_INFO_GENERAL_RW_ATTR(elname) \ F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, elname, elname) #define ATGC_INFO_RW_ATTR(name, elname) \ F2FS_RW_ATTR(ATGC_INFO, atgc_management, name, elname) /* GC_THREAD ATTR */ GC_THREAD_RW_ATTR(gc_urgent_sleep_time, urgent_sleep_time); GC_THREAD_RW_ATTR(gc_min_sleep_time, min_sleep_time); GC_THREAD_RW_ATTR(gc_max_sleep_time, max_sleep_time); GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time); GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent); GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio); /* SM_INFO ATTR */ SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments); SM_INFO_GENERAL_RW_ATTR(ipu_policy); SM_INFO_GENERAL_RW_ATTR(min_ipu_util); SM_INFO_GENERAL_RW_ATTR(min_fsync_blocks); SM_INFO_GENERAL_RW_ATTR(min_seq_blocks); SM_INFO_GENERAL_RW_ATTR(min_hot_blocks); SM_INFO_GENERAL_RW_ATTR(min_ssr_sections); SM_INFO_GENERAL_RW_ATTR(reserved_segments); /* DCC_INFO ATTR */ DCC_INFO_RW_ATTR(max_small_discards, max_discards); DCC_INFO_GENERAL_RW_ATTR(max_discard_request); DCC_INFO_GENERAL_RW_ATTR(min_discard_issue_time); DCC_INFO_GENERAL_RW_ATTR(mid_discard_issue_time); DCC_INFO_GENERAL_RW_ATTR(max_discard_issue_time); DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran); DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util); DCC_INFO_GENERAL_RW_ATTR(discard_granularity); DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard); DCC_INFO_GENERAL_RW_ATTR(discard_io_aware); /* NM_INFO ATTR */ NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks); NM_INFO_GENERAL_RW_ATTR(ram_thresh); NM_INFO_GENERAL_RW_ATTR(ra_nid_pages); NM_INFO_GENERAL_RW_ATTR(dirty_nats_ratio); /* F2FS_SBI ATTR */ F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list); F2FS_SBI_RW_ATTR(gc_idle, gc_mode); F2FS_SBI_RW_ATTR(gc_urgent, gc_mode); F2FS_SBI_RW_ATTR(cp_interval, interval_time[CP_TIME]); F2FS_SBI_RW_ATTR(idle_interval, interval_time[REQ_TIME]); F2FS_SBI_RW_ATTR(discard_idle_interval, interval_time[DISCARD_TIME]); F2FS_SBI_RW_ATTR(gc_idle_interval, interval_time[GC_TIME]); F2FS_SBI_RW_ATTR(umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); F2FS_SBI_RW_ATTR(gc_pin_file_thresh, gc_pin_file_threshold); F2FS_SBI_RW_ATTR(gc_reclaimed_segments, gc_reclaimed_segs); F2FS_SBI_GENERAL_RW_ATTR(max_victim_search); F2FS_SBI_GENERAL_RW_ATTR(migration_granularity); F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity); F2FS_SBI_GENERAL_RW_ATTR(dir_level); #ifdef CONFIG_F2FS_IOSTAT F2FS_SBI_GENERAL_RW_ATTR(iostat_enable); F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms); #endif F2FS_SBI_GENERAL_RW_ATTR(readdir_ra); F2FS_SBI_GENERAL_RW_ATTR(max_io_bytes); F2FS_SBI_GENERAL_RW_ATTR(data_io_flag); F2FS_SBI_GENERAL_RW_ATTR(node_io_flag); F2FS_SBI_GENERAL_RW_ATTR(gc_remaining_trials); F2FS_SBI_GENERAL_RW_ATTR(seq_file_ra_mul); F2FS_SBI_GENERAL_RW_ATTR(gc_segment_mode); F2FS_SBI_GENERAL_RW_ATTR(max_fragment_chunk); F2FS_SBI_GENERAL_RW_ATTR(max_fragment_hole); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_SBI_GENERAL_RW_ATTR(compr_written_block); F2FS_SBI_GENERAL_RW_ATTR(compr_saved_block); F2FS_SBI_GENERAL_RW_ATTR(compr_new_inode); F2FS_SBI_GENERAL_RW_ATTR(compress_percent); F2FS_SBI_GENERAL_RW_ATTR(compress_watermark); #endif /* atomic write */ F2FS_SBI_GENERAL_RO_ATTR(current_atomic_write); F2FS_SBI_GENERAL_RW_ATTR(peak_atomic_write); F2FS_SBI_GENERAL_RW_ATTR(committed_atomic_block); F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block); /* block age extent cache */ F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); /* read extent cache */ F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); #ifdef CONFIG_BLK_DEV_ZONED F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS STAT_INFO_RO_ATTR(cp_foreground_calls, cp_call_count[FOREGROUND]); STAT_INFO_RO_ATTR(cp_background_calls, cp_call_count[BACKGROUND]); STAT_INFO_RO_ATTR(gc_foreground_calls, gc_call_count[FOREGROUND]); STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]); #endif /* FAULT_INFO ATTR */ #ifdef CONFIG_F2FS_FAULT_INJECTION FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate); FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type); #endif /* RESERVED_BLOCKS ATTR */ RESERVED_BLOCKS_GENERAL_RW_ATTR(reserved_blocks); /* CPRC_INFO ATTR */ CPRC_INFO_GENERAL_RW_ATTR(ckpt_thread_ioprio); /* ATGC_INFO ATTR */ ATGC_INFO_RW_ATTR(atgc_candidate_ratio, candidate_ratio); ATGC_INFO_RW_ATTR(atgc_candidate_count, max_candidate_count); ATGC_INFO_RW_ATTR(atgc_age_weight, age_weight); ATGC_INFO_RW_ATTR(atgc_age_threshold, age_threshold); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); F2FS_GENERAL_RO_ATTR(ovp_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); F2FS_GENERAL_RO_ATTR(atgc_enabled); F2FS_GENERAL_RO_ATTR(gc_mode); #ifdef CONFIG_F2FS_STAT_FS F2FS_GENERAL_RO_ATTR(moved_blocks_background); F2FS_GENERAL_RO_ATTR(moved_blocks_foreground); F2FS_GENERAL_RO_ATTR(avg_vblocks); #endif #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption); F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2); #if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned); #endif F2FS_FEATURE_RO_ATTR(atomic_write); F2FS_FEATURE_RO_ATTR(extra_attr); F2FS_FEATURE_RO_ATTR(project_quota); F2FS_FEATURE_RO_ATTR(inode_checksum); F2FS_FEATURE_RO_ATTR(flexible_inline_xattr); F2FS_FEATURE_RO_ATTR(quota_ino); F2FS_FEATURE_RO_ATTR(inode_crtime); F2FS_FEATURE_RO_ATTR(lost_found); #ifdef CONFIG_FS_VERITY F2FS_FEATURE_RO_ATTR(verity); #endif F2FS_FEATURE_RO_ATTR(sb_checksum); #if IS_ENABLED(CONFIG_UNICODE) F2FS_FEATURE_RO_ATTR(casefold); #endif F2FS_FEATURE_RO_ATTR(readonly); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression); #endif F2FS_FEATURE_RO_ATTR(pin_file); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), ATTR_LIST(gc_min_sleep_time), ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_no_zoned_gc_percent), ATTR_LIST(gc_boost_zoned_gc_percent), ATTR_LIST(gc_valid_thresh_ratio), ATTR_LIST(gc_idle), ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(max_discard_request), ATTR_LIST(min_discard_issue_time), ATTR_LIST(mid_discard_issue_time), ATTR_LIST(max_discard_issue_time), ATTR_LIST(discard_io_aware_gran), ATTR_LIST(discard_urgent_util), ATTR_LIST(discard_granularity), ATTR_LIST(max_ordered_discard), ATTR_LIST(discard_io_aware), ATTR_LIST(pending_discard), ATTR_LIST(gc_mode), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), ATTR_LIST(min_seq_blocks), ATTR_LIST(min_hot_blocks), ATTR_LIST(min_ssr_sections), ATTR_LIST(reserved_segments), ATTR_LIST(max_victim_search), ATTR_LIST(migration_granularity), ATTR_LIST(migration_window_granularity), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(dirty_nats_ratio), ATTR_LIST(max_roll_forward_node_blocks), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), #ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), #endif ATTR_LIST(readdir_ra), ATTR_LIST(max_io_bytes), ATTR_LIST(gc_pin_file_thresh), ATTR_LIST(extension_list), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), ATTR_LIST(gc_remaining_trials), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), ATTR_LIST(ovp_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), ATTR_LIST(cp_background_calls), ATTR_LIST(gc_foreground_calls), ATTR_LIST(gc_background_calls), ATTR_LIST(moved_blocks_foreground), ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), #endif #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(unusable_blocks_per_sec), ATTR_LIST(blkzone_alloc_policy), #endif #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compr_written_block), ATTR_LIST(compr_saved_block), ATTR_LIST(compr_new_inode), ATTR_LIST(compress_percent), ATTR_LIST(compress_watermark), #endif /* For ATGC */ ATTR_LIST(atgc_candidate_ratio), ATTR_LIST(atgc_candidate_count), ATTR_LIST(atgc_age_weight), ATTR_LIST(atgc_age_threshold), ATTR_LIST(atgc_enabled), ATTR_LIST(seq_file_ra_mul), ATTR_LIST(gc_segment_mode), ATTR_LIST(gc_reclaimed_segments), ATTR_LIST(max_fragment_chunk), ATTR_LIST(max_fragment_hole), ATTR_LIST(current_atomic_write), ATTR_LIST(peak_atomic_write), ATTR_LIST(committed_atomic_block), ATTR_LIST(revoked_atomic_block), ATTR_LIST(hot_data_age_threshold), ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), ATTR_LIST(carve_out), NULL, }; ATTRIBUTE_GROUPS(f2fs); #define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION BASE_ATTR_LIST(encryption), BASE_ATTR_LIST(test_dummy_encryption_v2), #if IS_ENABLED(CONFIG_UNICODE) BASE_ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED BASE_ATTR_LIST(block_zoned), #endif BASE_ATTR_LIST(atomic_write), BASE_ATTR_LIST(extra_attr), BASE_ATTR_LIST(project_quota), BASE_ATTR_LIST(inode_checksum), BASE_ATTR_LIST(flexible_inline_xattr), BASE_ATTR_LIST(quota_ino), BASE_ATTR_LIST(inode_crtime), BASE_ATTR_LIST(lost_found), #ifdef CONFIG_FS_VERITY BASE_ATTR_LIST(verity), #endif BASE_ATTR_LIST(sb_checksum), #if IS_ENABLED(CONFIG_UNICODE) BASE_ATTR_LIST(casefold), #endif BASE_ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION BASE_ATTR_LIST(compression), #endif BASE_ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); F2FS_GENERAL_RO_ATTR(sb_status); F2FS_GENERAL_RO_ATTR(cp_status); F2FS_GENERAL_RO_ATTR(issued_discard); F2FS_GENERAL_RO_ATTR(queued_discard); F2FS_GENERAL_RO_ATTR(undiscard_blks); static struct attribute *f2fs_stat_attrs[] = { ATTR_LIST(sb_status), ATTR_LIST(cp_status), ATTR_LIST(issued_discard), ATTR_LIST(queued_discard), ATTR_LIST(undiscard_blks), NULL, }; ATTRIBUTE_GROUPS(f2fs_stat); F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT); F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED); F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR); F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA); F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM); F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO); F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME); F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND); F2FS_SB_FEATURE_RO_ATTR(verity, VERITY); F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); F2FS_SB_FEATURE_RO_ATTR(readonly, RO); F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS); static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_encryption), ATTR_LIST(sb_block_zoned), ATTR_LIST(sb_extra_attr), ATTR_LIST(sb_project_quota), ATTR_LIST(sb_inode_checksum), ATTR_LIST(sb_flexible_inline_xattr), ATTR_LIST(sb_quota_ino), ATTR_LIST(sb_inode_crtime), ATTR_LIST(sb_lost_found), ATTR_LIST(sb_verity), ATTR_LIST(sb_sb_checksum), ATTR_LIST(sb_casefold), ATTR_LIST(sb_compression), ATTR_LIST(sb_readonly), ATTR_LIST(sb_device_alias), NULL, }; ATTRIBUTE_GROUPS(f2fs_sb_feat); F2FS_TUNE_RW_ATTR(reclaim_caches_kb); static struct attribute *f2fs_tune_attrs[] = { BASE_ATTR_LIST(reclaim_caches_kb), NULL, }; ATTRIBUTE_GROUPS(f2fs_tune); static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, }; static const struct kobj_type f2fs_sb_ktype = { .default_groups = f2fs_groups, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; static const struct kobj_type f2fs_ktype = { .sysfs_ops = &f2fs_attr_ops, }; static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; static const struct sysfs_ops f2fs_feat_attr_ops = { .show = f2fs_base_attr_show, .store = f2fs_base_attr_store, }; static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, .sysfs_ops = &f2fs_feat_attr_ops, }; static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; static const struct sysfs_ops f2fs_tune_attr_ops = { .show = f2fs_base_attr_show, .store = f2fs_base_attr_store, }; static const struct kobj_type f2fs_tune_ktype = { .default_groups = f2fs_tune_groups, .sysfs_ops = &f2fs_tune_attr_ops, }; static struct kobject f2fs_tune = { .kset = &f2fs_kset, }; static ssize_t f2fs_stat_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_stat_kobj); struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); return a->show ? a->show(a, sbi, buf) : 0; } static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t len) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_stat_kobj); struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); return a->store ? a->store(a, sbi, buf, len) : 0; } static void f2fs_stat_kobj_release(struct kobject *kobj) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_stat_kobj); complete(&sbi->s_stat_kobj_unregister); } static const struct sysfs_ops f2fs_stat_attr_ops = { .show = f2fs_stat_attr_show, .store = f2fs_stat_attr_store, }; static const struct kobj_type f2fs_stat_ktype = { .default_groups = f2fs_stat_groups, .sysfs_ops = &f2fs_stat_attr_ops, .release = f2fs_stat_kobj_release, }; static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_feature_list_kobj); struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); return a->show ? a->show(a, sbi, buf) : 0; } static void f2fs_feature_list_kobj_release(struct kobject *kobj) { struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, s_feature_list_kobj); complete(&sbi->s_feature_list_kobj_unregister); } static const struct sysfs_ops f2fs_feature_list_attr_ops = { .show = f2fs_sb_feat_attr_show, }; static const struct kobj_type f2fs_feature_list_ktype = { .default_groups = f2fs_sb_feat_groups, .sysfs_ops = &f2fs_feature_list_attr_ops, .release = f2fs_feature_list_kobj_release, }; static int __maybe_unused segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); int i; seq_puts(seq, "format: segment_type|valid_blocks\n" "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); for (i = 0; i < total_segs; i++) { struct seg_entry *se = get_seg_entry(sbi, i); if ((i % 10) == 0) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, se->valid_blocks); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else seq_putc(seq, ' '); } return 0; } static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main); int i, j; seq_puts(seq, "format: segment_type|valid_blocks|bitmaps|mtime\n" "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); for (i = 0; i < total_segs; i++) { struct seg_entry *se = get_seg_entry(sbi, i); seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u|", se->type, se->valid_blocks); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) seq_printf(seq, " %.2x", se->cur_valid_map[j]); seq_printf(seq, "| %llx", se->mtime); seq_putc(seq, '\n'); } return 0; } static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); int i; seq_puts(seq, "format: victim_secmap bitmaps\n"); for (i = 0; i < MAIN_SECS(sbi); i++) { if ((i % 10) == 0) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0); if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1)) seq_putc(seq, '\n'); else seq_putc(seq, ' '); } return 0; } static int __maybe_unused discard_plist_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; int i, count; seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n"); if (!f2fs_realtime_discard_enable(sbi)) return 0; if (dcc) { mutex_lock(&dcc->cmd_lock); for (i = 0; i < MAX_PLIST_NUM; i++) { struct list_head *pend_list; struct discard_cmd *dc, *tmp; if (i % 8 == 0) seq_printf(seq, " %-3d", i); count = 0; pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) count++; if (count) seq_printf(seq, " %7d", count); else seq_puts(seq, " ."); if (i % 8 == 7) seq_putc(seq, '\n'); } seq_putc(seq, '\n'); mutex_unlock(&dcc->cmd_lock); } return 0; } static int __maybe_unused disk_map_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; seq_printf(seq, "Address Layout : %5luB Block address (# of Segments)\n", F2FS_BLKSIZE); seq_printf(seq, " SB : %12s\n", "0/1024B"); seq_printf(seq, " seg0_blkaddr : 0x%010x\n", SEG0_BLKADDR(sbi)); seq_printf(seq, " Checkpoint : 0x%010x (%10d)\n", le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr), 2); seq_printf(seq, " SIT : 0x%010x (%10d)\n", SIT_I(sbi)->sit_base_addr, le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_sit)); seq_printf(seq, " NAT : 0x%010x (%10d)\n", NM_I(sbi)->nat_blkaddr, le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_nat)); seq_printf(seq, " SSA : 0x%010x (%10d)\n", SM_I(sbi)->ssa_blkaddr, le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_ssa)); seq_printf(seq, " Main : 0x%010x (%10d)\n", SM_I(sbi)->main_blkaddr, le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main)); seq_printf(seq, " # of Sections : %12d\n", le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); seq_printf(seq, " Segs/Sections : %12d\n", SEGS_PER_SEC(sbi)); seq_printf(seq, " Section size : %12d MB\n", SEGS_PER_SEC(sbi) << 1); if (!f2fs_is_multi_device(sbi)) return 0; seq_puts(seq, "\nDisk Map for multi devices:\n"); for (i = 0; i < sbi->s_ndevs; i++) seq_printf(seq, "Disk:%2d (zoned=%d): 0x%010x - 0x%010x on %s\n", i, bdev_is_zoned(FDEV(i).bdev), FDEV(i).start_blk, FDEV(i).end_blk, FDEV(i).path); return 0; } int __init f2fs_init_sysfs(void) { int ret; kobject_set_name(&f2fs_kset.kobj, "f2fs"); f2fs_kset.kobj.parent = fs_kobj; ret = kset_register(&f2fs_kset); if (ret) return ret; ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, NULL, "features"); if (ret) goto put_kobject; ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype, NULL, "tuning"); if (ret) goto put_kobject; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); if (!f2fs_proc_root) { ret = -ENOMEM; goto put_kobject; } return 0; put_kobject: kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); return ret; } void f2fs_exit_sysfs(void) { kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); f2fs_proc_root = NULL; } int f2fs_register_sysfs(struct f2fs_sb_info *sbi) { struct super_block *sb = sbi->sb; int err; sbi->s_kobj.kset = &f2fs_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, "%s", sb->s_id); if (err) goto put_sb_kobj; sbi->s_stat_kobj.kset = &f2fs_kset; init_completion(&sbi->s_stat_kobj_unregister); err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype, &sbi->s_kobj, "stat"); if (err) goto put_stat_kobj; sbi->s_feature_list_kobj.kset = &f2fs_kset; init_completion(&sbi->s_feature_list_kobj_unregister); err = kobject_init_and_add(&sbi->s_feature_list_kobj, &f2fs_feature_list_ktype, &sbi->s_kobj, "feature_list"); if (err) goto put_feature_list_kobj; sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (!sbi->s_proc) { err = -ENOMEM; goto put_feature_list_kobj; } proc_create_single_data("segment_info", 0444, sbi->s_proc, segment_info_seq_show, sb); proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); #ifdef CONFIG_F2FS_IOSTAT proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); #endif proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); proc_create_single_data("discard_plist_info", 0444, sbi->s_proc, discard_plist_seq_show, sb); proc_create_single_data("disk_map", 0444, sbi->s_proc, disk_map_seq_show, sb); return 0; put_feature_list_kobj: kobject_put(&sbi->s_feature_list_kobj); wait_for_completion(&sbi->s_feature_list_kobj_unregister); put_stat_kobj: kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); put_sb_kobj: kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); return err; } void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root); kobject_put(&sbi->s_stat_kobj); wait_for_completion(&sbi->s_stat_kobj_unregister); kobject_put(&sbi->s_feature_list_kobj); wait_for_completion(&sbi->s_feature_list_kobj_unregister); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); }
203 203 1 1 75 11 4 68 68 68 68 4 4 4 4 2 1 2 1 1 1 18 5 1 4 279 145 183 146 76 64 64 64 64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Red Hat. All rights reserved. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/security.h> #include <linux/posix_acl_xattr.h> #include <linux/iversion.h> #include <linux/sched/mm.h> #include "ctree.h" #include "fs.h" #include "messages.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" #include "disk-io.h" #include "props.h" #include "locking.h" #include "accessors.h" #include "dir-item.h" int btrfs_getxattr(const struct inode *inode, const char *name, void *buffer, size_t size) { struct btrfs_dir_item *di; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct extent_buffer *leaf; int ret = 0; unsigned long data_ptr; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* lookup the xattr by name */ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, strlen(name), 0); if (!di) { ret = -ENODATA; goto out; } else if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; } leaf = path->nodes[0]; /* if size is 0, that means we want the size of the attr */ if (!size) { ret = btrfs_dir_data_len(leaf, di); goto out; } /* now get the data out of our dir_item */ if (btrfs_dir_data_len(leaf, di) > size) { ret = -ERANGE; goto out; } /* * The way things are packed into the leaf is like this * |struct btrfs_dir_item|name|data| * where name is the xattr name, so security.foo, and data is the * content of the xattr. data_ptr points to the location in memory * where the data starts in the in memory leaf */ data_ptr = (unsigned long)((char *)(di + 1) + btrfs_dir_name_len(leaf, di)); read_extent_buffer(leaf, buffer, data_ptr, btrfs_dir_data_len(leaf, di)); ret = btrfs_dir_data_len(leaf, di); out: btrfs_free_path(path); return ret; } int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; ASSERT(trans); if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->skip_release_on_error = 1; if (!value) { di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, -1); if (!di && (flags & XATTR_REPLACE)) ret = -ENODATA; else if (IS_ERR(di)) ret = PTR_ERR(di); else if (di) ret = btrfs_delete_one_dir_name(trans, root, path, di); goto out; } /* * For a replace we can't just do the insert blindly. * Do a lookup first (read-only btrfs_search_slot), and return if xattr * doesn't exist. If it exists, fall down below to the insert/replace * path - we can't race with a concurrent xattr delete, because the VFS * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { btrfs_assert_inode_locked(BTRFS_I(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, 0); if (!di) ret = -ENODATA; else if (IS_ERR(di)) ret = PTR_ERR(di); if (ret) goto out; btrfs_release_path(path); di = NULL; } ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(BTRFS_I(inode)), name, name_len, value, size); if (ret == -EOVERFLOW) { /* * We have an existing item in a leaf, split_leaf couldn't * expand it. That item might have or not a dir_item that * matches our target xattr, so lets check. */ ret = 0; btrfs_assert_tree_write_locked(path->nodes[0]); di = btrfs_match_dir_item_name(path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; di = btrfs_match_dir_item_name(path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; } if (di && (flags & XATTR_CREATE)) { ret = -EEXIST; goto out; } if (di) { /* * We're doing a replace, and it must be atomic, that is, at * any point in time we have either the old or the new xattr * value in the tree. We don't want readers (getxattr and * listxattrs) to miss a value, this is specially important * for ACLs. */ const int slot = path->slots[0]; struct extent_buffer *leaf = path->nodes[0]; const u16 old_data_len = btrfs_dir_data_len(leaf, di); const u32 item_size = btrfs_item_size(leaf, slot); const u32 data_size = sizeof(*di) + name_len + size; unsigned long data_ptr; char *ptr; if (size > old_data_len) { if (btrfs_leaf_free_space(leaf) < (size - old_data_len)) { ret = -ENOSPC; goto out; } } if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) btrfs_extend_item(trans, path, size - old_data_len); else if (size < old_data_len) btrfs_truncate_item(trans, path, data_size, 1); } else { /* There are other xattrs packed in the same item. */ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; btrfs_extend_item(trans, path, data_size); } ptr = btrfs_item_ptr(leaf, slot, char); ptr += btrfs_item_size(leaf, slot) - data_size; di = (struct btrfs_dir_item *)ptr; btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is * where our xattr dir_item is and btrfs_insert_xattr_item() * filled it. */ } out: btrfs_free_path(path); if (!ret) { set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); clear_bit(BTRFS_INODE_NO_XATTRS, &BTRFS_I(inode)->runtime_flags); } return ret; } /* * @value: "" makes the attribute to empty, NULL removes it */ int btrfs_setxattr_trans(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; const bool start_trans = (current->journal_info == NULL); int ret; if (start_trans) { /* * 1 unit for inserting/updating/deleting the xattr * 1 unit for the inode item update */ trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); } else { /* * This can happen when smack is enabled and a directory is being * created. It happens through d_instantiate_new(), which calls * smack_d_instantiate(), which in turn calls __vfs_setxattr() to * set the transmute xattr (XATTR_NAME_SMACKTRANSMUTE) on the * inode. We have already reserved space for the xattr and inode * update at btrfs_mkdir(), so just use the transaction handle. * We don't join or start a transaction, as that will reset the * block_rsv of the handle and trigger a warning for the start * case. */ ASSERT(strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0); trans = current->journal_info; } ret = btrfs_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; inode_inc_iversion(inode); inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); out: if (start_trans) btrfs_end_transaction(trans); return ret; } ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key found_key; struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; /* * ok we want all objects associated with this id. * NOTE: we set key.offset = 0; because we want to start with the * first xattr that we find and walk forward */ key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; /* search for our xattrs */ btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf; int slot; struct btrfs_dir_item *di; u32 item_size; u32 cur; leaf = path->nodes[0]; slot = path->slots[0]; /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; if (found_key.type < BTRFS_XATTR_ITEM_KEY) continue; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); item_size = btrfs_item_size(leaf, slot); cur = 0; while (cur < item_size) { u16 name_len = btrfs_dir_name_len(leaf, di); u16 data_len = btrfs_dir_data_len(leaf, di); u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); total_size += name_len + 1; /* * We are just looking for how big our buffer needs to * be. */ if (!size) goto next; if (!buffer || (name_len + 1) > size_left) { iter_ret = -ERANGE; break; } read_extent_buffer(leaf, buffer, name_ptr, name_len); buffer[name_len] = '\0'; size_left -= name_len + 1; buffer += name_len + 1; next: cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } } if (iter_ret < 0) ret = iter_ret; else ret = total_size; btrfs_free_path(path); return ret; } static int btrfs_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { name = xattr_full_name(handler, name); return btrfs_getxattr(inode, name, buffer, size); } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { if (btrfs_root_readonly(BTRFS_I(inode)->root)) return -EROFS; name = xattr_full_name(handler, name); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { int ret; bool is_cap = false; name = xattr_full_name(handler, name); /* * security.capability doesn't cache the results, so calls into us * constantly to see if there's a capability xattr. Cache the result * here in order to avoid wasting time doing lookups for xattrs we know * don't exist. */ if (strcmp(name, XATTR_NAME_CAPS) == 0) { is_cap = true; if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags)) return -ENODATA; } ret = btrfs_getxattr(inode, name, buffer, size); if (ret == -ENODATA && is_cap) set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); return ret; } static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { if (btrfs_root_readonly(BTRFS_I(inode)->root)) return -EROFS; name = xattr_full_name(handler, name); if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); return btrfs_setxattr_trans(inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { int ret; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; name = xattr_full_name(handler, name); ret = btrfs_validate_prop(BTRFS_I(inode), name, value, size); if (ret) return ret; if (btrfs_ignore_prop(BTRFS_I(inode), name)) return 0; trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_set_prop(trans, BTRFS_I(inode), name, value, size, flags); if (!ret) { inode_inc_iversion(inode); inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); } btrfs_end_transaction(trans); return ret; } static const struct xattr_handler btrfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = btrfs_xattr_handler_get_security, .set = btrfs_xattr_handler_set_security, }; static const struct xattr_handler btrfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = btrfs_xattr_handler_get, .set = btrfs_xattr_handler_set, }; static const struct xattr_handler btrfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = btrfs_xattr_handler_get, .set = btrfs_xattr_handler_set, }; static const struct xattr_handler btrfs_btrfs_xattr_handler = { .prefix = XATTR_BTRFS_PREFIX, .get = btrfs_xattr_handler_get, .set = btrfs_xattr_handler_set_prop, }; const struct xattr_handler * const btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, &btrfs_btrfs_xattr_handler, NULL, }; static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_private) { struct btrfs_trans_handle *trans = fs_private; const struct xattr *xattr; unsigned int nofs_flag; char *name; int ret = 0; /* * We're holding a transaction handle, so use a NOFS memory allocation * context to avoid deadlock if reclaim happens. */ nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { ret = -ENOMEM; break; } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); if (strcmp(name, XATTR_NAME_CAPS) == 0) clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); ret = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); if (ret < 0) break; } memalloc_nofs_restore(nofs_flag); return ret; } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) { return security_inode_init_security(inode, dir, qstr, &btrfs_initxattrs, trans); }
97 50 96 31 97 97 72 97 97 95 2 2 21 21 10 21 21 46 46 46 46 46 38 11 1 45 45 45 1 45 45 45 25 27 10 40 45 45 45 1 1 1 6 22 41 30 30 30 12 27 28 29 4 4 4 4 4 23 23 23 2 23 23 2 23 23 21 2 29 30 30 30 10 21 30 30 3 30 30 1 1 30 30 30 15 15 3 1 1 1 1 15 2 2 2 2 2 2 2 2 2 2 23 6 17 23 17 13 6 23 23 17 6 6 6 23 17 6 13 19 6 10 22 23 23 23 23 23 6 17 23 20 10 1 23 1 5 2 2 2 2 5 5 5 1 5 2 2 2 2 2 2 2 2 2 1 1 1 5 5 5 5 5 1 23 18 5 3 23 23 23 23 3 23 10 10 15 21 21 21 51 52 1 7 44 43 23 21 9 9 75 186 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include "bcachefs.h" #include "alloc_foreground.h" #include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" #include "clock.h" #include "compress.h" #include "debug.h" #include "ec.h" #include "error.h" #include "extent_update.h" #include "inode.h" #include "io_write.h" #include "journal.h" #include "keylist.h" #include "move.h" #include "nocow_locking.h" #include "rebalance.h" #include "subvolume.h" #include "super.h" #include "super-io.h" #include "trace.h" #include <linux/blkdev.h> #include <linux/prefetch.h> #include <linux/random.h> #include <linux/sched/mm.h> #ifdef CONFIG_BCACHEFS_DEBUG static unsigned bch2_write_corrupt_ratio; module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(write_corrupt_ratio, ""); #endif #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, u64 now, int rw) { u64 latency_capable = ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; /* ideally we'd be taking into account the device's variance here: */ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); s64 latency_over = io_latency - latency_threshold; if (latency_threshold && latency_over > 0) { /* * bump up congested by approximately latency_over * 4 / * latency_threshold - we don't need much accuracy here so don't * bother with the divide: */ if (atomic_read(&ca->congested) < CONGESTED_MAX) atomic_add(latency_over >> max_t(int, ilog2(latency_threshold) - 2, 0), &ca->congested); ca->congested_last = now; } else if (atomic_read(&ca->congested) > 0) { atomic_dec(&ca->congested); } } void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) { atomic64_t *latency = &ca->cur_latency[rw]; u64 now = local_clock(); u64 io_latency = time_after64(now, submit_time) ? now - submit_time : 0; u64 old, new; old = atomic64_read(latency); do { /* * If the io latency was reasonably close to the current * latency, skip doing the update and atomic operation - most of * the time: */ if (abs((int) (old - io_latency)) < (old >> 1) && now & ~(~0U << 5)) break; new = ewma_add(old, io_latency, 5); } while (!atomic64_try_cmpxchg(latency, &old, new)); bch2_congested_acct(ca, io_latency, now, rw); __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif /* Allocate, free from mempool: */ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bvec_iter_all iter; struct bio_vec *bv; bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; } static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) { struct page *page; if (likely(!*using_mempool)) { page = alloc_page(GFP_NOFS); if (unlikely(!page)) { mutex_lock(&c->bio_bounce_pages_lock); *using_mempool = true; goto pool_alloc; } } else { pool_alloc: page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); } return page; } void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, size_t size) { bool using_mempool = false; while (size) { struct page *page = __bio_alloc_page_pool(c, &using_mempool); unsigned len = min_t(size_t, PAGE_SIZE, size); BUG_ON(!bio_add_page(bio, page, len, 0)); size -= len; } if (using_mempool) mutex_unlock(&c->bio_bounce_pages_lock); } /* Extent update path: */ int bch2_sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, bool *usage_increasing, s64 *i_sectors_delta, s64 *disk_sectors_delta) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c old; unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; *usage_increasing = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; bch2_trans_copy_iter(trans, &iter, extent_iter); for_each_btree_key_max_continue_norestart(trans, iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); *i_sectors_delta += sectors * (bkey_extent_is_allocation(&new->k) - bkey_extent_is_allocation(old.k)); *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) : 0; if (!*usage_increasing && (new->k.p.snapshot != old.k->p.snapshot || new_replicas > bch2_bkey_replicas(c, old) || (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; if (bkey_ge(old.k->p, new->k.p)) break; } bch2_trans_iter_exit(trans, &iter); return ret; } static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, struct btree_iter *extent_iter, u64 new_i_size, s64 i_sectors_delta) { /* * Crazy performance optimization: * Every extent update needs to also update the inode: the inode trigger * will set bi->journal_seq to the journal sequence number of this * transaction - for fsync. * * But if that's the only reason we're updating the inode (we're not * updating bi_size or bi_sectors), then we don't need the inode update * to be journalled - if we crash, the bi_journal_seq update will be * lost, but that's fine. */ unsigned inode_update_flags = BTREE_UPDATE_nojournal; struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), BTREE_ITER_intent| BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) return ret; /* * varint_decode_fast(), in the inode .invalid method, reads up to 7 * bytes past the end of the buffer: */ struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8); ret = PTR_ERR_OR_ZERO(k_mut); if (unlikely(ret)) goto err; bkey_reassemble(k_mut, k); if (unlikely(k_mut->k.type != KEY_TYPE_inode_v3)) { k_mut = bch2_inode_to_v3(trans, k_mut); ret = PTR_ERR_OR_ZERO(k_mut); if (unlikely(ret)) goto err; } struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut); if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && new_i_size > le64_to_cpu(inode->v.bi_size)) { inode->v.bi_size = cpu_to_le64(new_i_size); inode_update_flags = 0; } if (i_sectors_delta) { le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); inode_update_flags = 0; } if (inode->k.p.snapshot != iter.snapshot) { inode->k.p.snapshot = iter.snapshot; inode_update_flags = 0; } ret = bch2_trans_update(trans, &iter, &inode->k_i, BTREE_UPDATE_internal_snapshot_node| inode_update_flags); err: bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_extent_update(struct btree_trans *trans, subvol_inum inum, struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, u64 new_i_size, s64 *i_sectors_delta_total, bool check_enospc) { struct bpos next_pos; bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; /* * This traverses us the iterator without changing iter->path->pos to * search_key() (which is pos + 1 for extents): we want there to be a * path already traversed at iter->pos because * bch2_trans_extent_update() will use it to attempt extent merging */ ret = __bch2_btree_iter_traverse(trans, iter); if (ret) return ret; ret = bch2_extent_trim_atomic(trans, iter, k); if (ret) return ret; next_pos = k->k.p; ret = bch2_sum_sector_overwrites(trans, iter, k, &usage_increasing, &i_sectors_delta, &disk_sectors_delta); if (ret) return ret; if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, !check_enospc || !usage_increasing ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; } /* * Note: * We always have to do an inode update - even when i_size/i_sectors * aren't changing - for fsync to work properly; fsync relies on * inode->bi_journal_seq which is updated by the trigger code: */ ret = bch2_extent_update_i_size_sectors(trans, iter, min(k->k.p.offset << 9, new_i_size), i_sectors_delta) ?: bch2_trans_update(trans, iter, k, 0) ?: bch2_trans_commit(trans, disk_res, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc); if (unlikely(ret)) return ret; if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; bch2_btree_iter_set_pos(trans, iter, next_pos); return 0; } static int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; subvol_inum inum = { .subvol = op->subvol, .inum = k->k.p.inode, }; int ret; BUG_ON(!inum.subvol); bch2_bkey_buf_init(&sk); do { bch2_trans_begin(trans); k = bch2_keylist_front(keys); bch2_bkey_buf_copy(&sk, c, k); ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &sk.k->k.p.snapshot); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, bkey_start_pos(&sk.k->k), BTREE_ITER_slots|BTREE_ITER_intent); ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_check_enospc); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; if (bkey_ge(iter.pos, k->k.p)) bch2_keylist_pop_front(&op->insert_keys); else bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); return ret; } /* Writes */ void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) { struct printbuf buf = PRINTBUF; if (op->subvol) { bch2_inum_offset_err_msg(op->c, &buf, (subvol_inum) { op->subvol, op->pos.inode, }, offset << 9); } else { struct bpos pos = op->pos; pos.offset = offset; bch2_inum_snap_offset_err_msg(op->c, &buf, pos); } prt_str(&buf, "write error: "); va_list args; va_start(args, fmt); prt_vprintf(&buf, fmt, args); va_end(args); if (op->flags & BCH_WRITE_move) { struct data_update *u = container_of(op, struct data_update, op); prt_printf(&buf, "\n from internal move "); bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); } bch_err_ratelimited(op->c, "%s", buf.buf); printbuf_exit(&buf); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, bool nocow) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); struct bch_write_bio *n; BUG_ON(c->opts.nochanges); bkey_for_each_ptr(ptrs, ptr) { /* * XXX: btree writes should be using io_ref[WRITE], but we * aren't retrying failed btree writes yet (due to device * removal/ro): */ struct bch_dev *ca = nocow ? bch2_dev_have_ref(c, ptr->dev) : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; n->parent = wbio; n->split = true; n->bounce = false; n->put_bio = true; n->bio.bi_opf = wbio->bio.bi_opf; bio_inc_remaining(&wbio->bio); } else { n = wbio; n->split = false; } n->c = c; n->dev = ptr->dev; n->have_ioref = ca != NULL; n->nocow = nocow; n->submit_time = local_clock(); n->inode_offset = bkey_start_offset(&k->k); if (nocow) n->nocow_bucket = PTR_BUCKET_NR(ca, ptr); n->bio.bi_iter.bi_sector = ptr->offset; if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); bio_set_dev(&n->bio, ca->disk_sb.bdev); if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { bio_endio(&n->bio); continue; } submit_bio(&n->bio); } else { n->bio.bi_status = BLK_STS_REMOVED; bio_endio(&n->bio); } } } static void __bch2_write(struct bch_write_op *); static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; EBUG_ON(op->open_buckets.nr); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_move)) bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); EBUG_ON(cl->parent); closure_debug_destroy(cl); if (op->end_io) op->end_io(op); } static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != keys->top; src = n) { n = bkey_next(src); if (bkey_extent_is_direct_data(&src->k)) { bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) return -BCH_ERR_data_write_io; } if (dst != src) memmove_u64s_down(dst, src, src->k.u64s); dst = bkey_next(dst); } keys->top = dst; return 0; } /** * __bch2_write_index - after a write, update index to point to new data * @op: bch_write_op to process */ static void __bch2_write_index(struct bch_write_op *op) { struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; unsigned dev; int ret = 0; if (unlikely(op->flags & BCH_WRITE_io_error)) { ret = bch2_write_drop_io_error_ptrs(op); if (ret) goto err; } if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); ret = !(op->flags & BCH_WRITE_move) ? bch2_write_index_default(op) : bch2_data_update_index_update(op); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); BUG_ON(keylist_sectors(keys) && !ret); op->written += sectors_start - keylist_sectors(keys); if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); bch2_write_op_error(op, bkey_start_offset(&insert->k), "btree update error: %s", bch2_err_str(ret)); } if (ret) goto err; } out: /* If some a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; op->error = ret; op->flags |= BCH_WRITE_submitted; goto out; } static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) { if (state != wp->state) { struct task_struct *p = current; u64 now = ktime_get_ns(); u64 runtime = p->se.sum_exec_runtime + (now - p->se.exec_start); if (state == WRITE_POINT_runnable) wp->last_runtime = runtime; else if (wp->state == WRITE_POINT_runnable) wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; if (wp->last_state_change && time_after64(now, wp->last_state_change)) wp->time[wp->state] += now - wp->last_state_change; wp->state = state; wp->last_state_change = now; } } static inline void wp_update_state(struct write_point *wp, bool running) { enum write_point_state state; state = running ? WRITE_POINT_runnable: !list_empty(&wp->writes) ? WRITE_POINT_waiting_io : WRITE_POINT_stopped; __wp_update_state(wp, state); } static CLOSURE_CALLBACK(bch2_write_index) { closure_type(op, struct bch_write_op, cl); struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; if ((op->flags & BCH_WRITE_submitted) && (op->flags & BCH_WRITE_move)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); spin_lock_irqsave(&wp->writes_lock, flags); if (wp->state == WRITE_POINT_waiting_io) __wp_update_state(wp, WRITE_POINT_waiting_work); list_add_tail(&op->wp_list, &wp->writes); spin_unlock_irqrestore (&wp->writes_lock, flags); queue_work(wq, &wp->index_update_work); } static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) { op->wp = wp; if (wp->state == WRITE_POINT_stopped) { spin_lock_irq(&wp->writes_lock); __wp_update_state(wp, WRITE_POINT_waiting_io); spin_unlock_irq(&wp->writes_lock); } } void bch2_write_point_do_index_updates(struct work_struct *work) { struct write_point *wp = container_of(work, struct write_point, index_update_work); struct bch_write_op *op; while (1) { spin_lock_irq(&wp->writes_lock); op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); wp_update_state(wp, op != NULL); spin_unlock_irq(&wp->writes_lock); if (!op) break; op->flags |= BCH_WRITE_in_worker; __bch2_write_index(op); if (!(op->flags & BCH_WRITE_submitted)) __bch2_write(op); else bch2_write_done(&op->cl); } } static void bch2_write_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL; bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, wbio->submit_time, !bio->bi_status); if (unlikely(bio->bi_status)) { if (ca) bch_err_inum_offset_ratelimited(ca, op->pos.inode, wbio->inode_offset << 9, "data write error: %s", bch2_blk_status_to_str(bio->bi_status)); else bch_err_inum_offset_ratelimited(c, op->pos.inode, wbio->inode_offset << 9, "data write error: %s", bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); op->flags |= BCH_WRITE_io_error; } if (wbio->nocow) { bch2_bucket_nocow_unlock(&c->nocow_locks, POS(ca->dev_idx, wbio->nocow_bucket), BUCKET_NOCOW_LOCK_UPDATE); set_bit(wbio->dev, op->devs_need_flush->d); } if (wbio->have_ioref) percpu_ref_put(&ca->io_ref[WRITE]); if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); if (wbio->put_bio) bio_put(bio); if (parent) bio_endio(&parent->bio); else closure_put(cl); } static void init_append_extent(struct bch_write_op *op, struct write_point *wp, struct bversion version, struct bch_extent_crc_unpacked crc) { struct bkey_i_extent *e; op->pos.offset += crc.uncompressed_size; e = bkey_extent_init(op->insert_keys.top); e->k.p = op->pos; e->k.size = crc.uncompressed_size; e->k.bversion = version; if (crc.csum_type || crc.compression_type || crc.nonce) bch2_extent_crc_append(&e->k_i, crc); bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_cached); bch2_keylist_push(&op->insert_keys); } static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct write_point *wp, struct bio *src, bool *page_alloc_failed, void *buf) { struct bch_write_bio *wbio; struct bio *bio; unsigned output_available = min(wp->sectors_free << 9, src->bi_iter.bi_size); unsigned pages = DIV_ROUND_UP(output_available + (buf ? ((unsigned long) buf & (PAGE_SIZE - 1)) : 0), PAGE_SIZE); pages = min(pages, BIO_MAX_VECS); bio = bio_alloc_bioset(NULL, pages, 0, GFP_NOFS, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; /* copy WRITE_SYNC flag */ wbio->bio.bi_opf = src->bi_opf; if (buf) { bch2_bio_map(bio, buf, output_available); return bio; } wbio->bounce = true; /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: */ bch2_bio_alloc_pages_pool(c, bio, min_t(unsigned, output_available, c->opts.encoded_extent_max)); if (bio->bi_iter.bi_size < output_available) *page_alloc_failed = bch2_bio_alloc_pages(bio, output_available - bio->bi_iter.bi_size, GFP_NOFS) != 0; return bio; } static int bch2_write_rechecksum(struct bch_fs *c, struct bch_write_op *op, unsigned new_csum_type) { struct bio *bio = &op->wbio.bio; struct bch_extent_crc_unpacked new_crc; /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ if (bch2_csum_type_is_encryption(op->crc.csum_type) != bch2_csum_type_is_encryption(new_csum_type)) new_csum_type = op->crc.csum_type; int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, NULL, &new_crc, op->crc.offset, op->crc.live_size, new_csum_type); if (ret) return ret; bio_advance(bio, op->crc.offset << 9); bio->bi_iter.bi_size = op->crc.live_size << 9; op->crc = new_crc; return 0; } static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; struct bch_csum csum; int ret = 0; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); /* Can we just write the entire extent as is? */ if (op->crc.uncompressed_size == op->crc.live_size && op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && op->crc.compressed_size <= wp->sectors_free && (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type) { ret = bch2_write_rechecksum(c, op, op->csum_type); if (ret) return ret; } return 1; } /* * If the data is compressed and we couldn't write the entire extent as * is, we have to decompress it: */ if (crc_is_compressed(op->crc)) { /* Last point we can still verify checksum: */ struct nonce nonce = extent_nonce(op->version, op->crc); csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) goto csum_err; if (bch2_csum_type_is_encryption(op->crc.csum_type)) { ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); if (ret) return ret; op->crc.csum_type = 0; op->crc.csum = (struct bch_csum) { 0, 0 }; } ret = bch2_bio_uncompress_inplace(op, bio); if (ret) return ret; } /* * No longer have compressed data after this point - data might be * encrypted: */ /* * If the data is checksummed and we're only writing a subset, * rechecksum and adjust bio to point to currently live data: */ if (op->crc.live_size != op->crc.uncompressed_size || op->crc.csum_type != op->csum_type) { ret = bch2_write_rechecksum(c, op, op->csum_type); if (ret) return ret; } /* * If we want to compress the data, it has to be decrypted: */ if (bch2_csum_type_is_encryption(op->crc.csum_type) && (op->compression_opt || op->crc.csum_type != op->csum_type)) { struct nonce nonce = extent_nonce(op->version, op->crc); csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) goto csum_err; ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); if (ret) return ret; op->crc.csum_type = 0; op->crc.csum = (struct bch_csum) { 0, 0 }; } return 0; csum_err: bch2_write_op_error(op, op->pos.offset, "error verifying existing checksum while moving existing data (memory corruption?)\n" " expected %0llx:%0llx got %0llx:%0llx type %s", op->crc.csum.hi, op->crc.csum.lo, csum.hi, csum.lo, op->crc.csum_type < BCH_CSUM_NR ? __bch2_csum_types[op->crc.csum_type] : "(unknown)"); return -BCH_ERR_data_write_csum; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, struct bio **_dst) { struct bch_fs *c = op->c; struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; void *ec_buf; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; int ret, more = 0; if (op->incompressible) op->compression_opt = 0; BUG_ON(!bio_sectors(src)); ec_buf = bch2_writepoint_ec_buf(c, wp); if (unlikely(op->flags & BCH_WRITE_data_encoded)) { ret = bch2_write_prep_encoded_data(op, wp); if (ret < 0) goto err; if (ret) { if (ec_buf) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); bio_copy_data(dst, src); bounce = true; } init_append_extent(op, wp, op->version, op->crc); goto do_write; } } if (ec_buf || op->compression_opt || (op->csum_type && !(op->flags & BCH_WRITE_pages_stable)) || (bch2_csum_type_is_encryption(op->csum_type) && !(op->flags & BCH_WRITE_pages_owned))) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); bounce = true; } #ifdef CONFIG_BCACHEFS_DEBUG unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); if (!bounce && write_corrupt_ratio) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); bounce = true; } #endif saved_iter = dst->bi_iter; do { struct bch_extent_crc_unpacked crc = { 0 }; struct bversion version = op->version; size_t dst_len = 0, src_len = 0; if (page_alloc_failed && dst->bi_iter.bi_size < (wp->sectors_free << 9) && dst->bi_iter.bi_size < c->opts.encoded_extent_max) break; BUG_ON(op->compression_opt && (op->flags & BCH_WRITE_data_encoded) && bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_opt && !bounce); crc.compression_type = op->incompressible ? BCH_COMPRESSION_TYPE_incompressible : op->compression_opt ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, op->compression_opt) : 0; if (!crc_is_compressed(crc)) { dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); if (op->csum_type) dst_len = min_t(unsigned, dst_len, c->opts.encoded_extent_max); if (bounce) { swap(dst->bi_iter.bi_size, dst_len); bio_copy_data(dst, src); swap(dst->bi_iter.bi_size, dst_len); } src_len = dst_len; } BUG_ON(!src_len || !dst_len); if (bch2_csum_type_is_encryption(op->csum_type)) { if (bversion_zero(version)) { version.lo = atomic64_inc_return(&c->key_version); } else { crc.nonce = op->nonce; op->nonce += src_len >> 9; } } if ((op->flags & BCH_WRITE_data_encoded) && !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { u8 compression_type = crc.compression_type; u16 nonce = crc.nonce; /* * Note: when we're using rechecksum(), we need to be * checksumming @src because it has all the data our * existing checksum covers - if we bounced (because we * were trying to compress), @dst will only have the * part of the data the new checksum will cover. * * But normally we want to be checksumming post bounce, * because part of the reason for bouncing is so the * data can't be modified (by userspace) while it's in * flight. */ ret = bch2_rechecksum_bio(c, src, version, op->crc, &crc, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), op->csum_type); if (ret) goto err; /* * rchecksum_bio sets compression_type on crc from op->crc, * this isn't always correct as sometimes we're changing * an extent from uncompressed to incompressible. */ crc.compression_type = compression_type; crc.nonce = nonce; } else { if ((op->flags & BCH_WRITE_data_encoded) && (ret = bch2_rechecksum_bio(c, src, version, op->crc, NULL, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), op->crc.csum_type))) goto err; crc.compressed_size = dst_len >> 9; crc.uncompressed_size = src_len >> 9; crc.live_size = src_len >> 9; swap(dst->bi_iter.bi_size, dst_len); ret = bch2_encrypt_bio(c, op->csum_type, extent_nonce(version, crc), dst); if (ret) goto err; crc.csum = bch2_checksum_bio(c, op->csum_type, extent_nonce(version, crc), dst); crc.csum_type = op->csum_type; swap(dst->bi_iter.bi_size, dst_len); } init_append_extent(op, wp, version, crc); #ifdef CONFIG_BCACHEFS_DEBUG if (write_corrupt_ratio) { swap(dst->bi_iter.bi_size, dst_len); bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); swap(dst->bi_iter.bi_size, dst_len); } #endif if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); total_output += dst_len; total_input += src_len; } while (dst->bi_iter.bi_size && src->bi_iter.bi_size && wp->sectors_free && !bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_EXTENT_U64s_MAX)); more = src->bi_iter.bi_size != 0; dst->bi_iter = saved_iter; if (dst == src && more) { BUG_ON(total_output != total_input); dst = bio_split(src, total_input >> 9, GFP_NOFS, &c->bio_write); wbio_init(dst)->put_bio = true; /* copy WRITE_SYNC flag */ dst->bi_opf = src->bi_opf; } dst->bi_iter.bi_size = total_output; do_write: *_dst = dst; return more; err: if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); if (to_wbio(dst)->put_bio) bio_put(dst); return ret; } static bool bch2_extent_is_writeable(struct bch_write_op *op, struct bkey_s_c k) { struct bch_fs *c = op->c; struct bkey_s_c_extent e; struct extent_ptr_decoded p; const union bch_extent_entry *entry; unsigned replicas = 0; if (k.k->type != KEY_TYPE_extent) return false; e = bkey_s_c_to_extent(k); rcu_read_lock(); extent_for_each_ptr_decode(e, p, entry) { if (crc_is_encoded(p.crc) || p.has_ec) { rcu_read_unlock(); return false; } replicas += bch2_extent_ptr_durability(c, &p); } rcu_read_unlock(); return replicas >= op->opts.data_replicas; } static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *orig, struct bkey_s_c k, u64 new_i_size) { if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { /* trace this */ return 0; } struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; bch2_cut_front(bkey_start_pos(&orig->k), new); bch2_cut_back(orig->k.p, new); struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) ptr->unwritten = 0; /* * Note that we're not calling bch2_subvol_get_snapshot() in this path - * that was done when we kicked off the write, and here it's important * that we update the extent that we wrote to - even if a snapshot has * since been created. The write is still outstanding, so we're ok * w.r.t. snapshot atomicity: */ return bch2_extent_update_i_size_sectors(trans, iter, min(new->k.p.offset << 9, new_i_size), 0) ?: bch2_trans_update(trans, iter, new, BTREE_UPDATE_internal_snapshot_node); } static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); int ret = 0; for_each_keylist_key(&op->insert_keys, orig) { ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); if (ret) break; } bch2_trans_put(trans); if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); bch2_write_op_error(op, bkey_start_offset(&insert->k), "btree update error: %s", bch2_err_str(ret)); } if (ret) op->error = ret; } static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { op->error = -BCH_ERR_data_write_io; } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } static CLOSURE_CALLBACK(bch2_nocow_write_done) { closure_type(op, struct bch_write_op, cl); __bch2_nocow_write_done(op); bch2_write_done(cl); } struct bucket_to_lock { struct bpos b; unsigned gen; struct nocow_lock_bucket *l; }; static void bch2_nocow_write(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets; u32 snapshot; struct bucket_to_lock *stale_at; int stale, ret; if (op->flags & BCH_WRITE_move) return; darray_init(&buckets); trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); if (unlikely(ret)) goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(op->pos.inode, op->pos.offset, snapshot), BTREE_ITER_slots); while (1) { struct bio *bio = &op->wbio.bio; buckets.nr = 0; ret = bch2_trans_relock(trans); if (ret) break; k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) break; /* fall back to normal cow write path? */ if (unlikely(k.k->p.snapshot != snapshot || !bch2_extent_is_writeable(op, k))) break; if (bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), k.k->u64s)) break; /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); if (unlikely(!ca)) goto err_get_ioref; struct bpos b = PTR_BUCKET_POS(ca, ptr); struct nocow_lock_bucket *l = bucket_nocow_lock(&c->nocow_locks, bucket_to_u64(b)); prefetch(l); /* XXX allocating memory with btree locks held - rare */ darray_push_gfp(&buckets, ((struct bucket_to_lock) { .b = b, .gen = ptr->gen, .l = l, }), GFP_KERNEL|__GFP_NOFAIL); if (ptr->unwritten) op->flags |= BCH_WRITE_convert_unwritten; } /* Unlock before taking nocow locks, doing IO: */ bkey_reassemble(op->insert_keys.top, k); bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); if (op->flags & BCH_WRITE_convert_unwritten) bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { struct bch_dev *ca = bch2_dev_have_ref(c, i->b.inode); __bch2_bucket_nocow_lock(&c->nocow_locks, i->l, bucket_to_u64(i->b), BUCKET_NOCOW_LOCK_UPDATE); int gen = bucket_gen_get(ca, i->b.offset); stale = gen < 0 ? gen : gen_after(gen, i->gen); if (unlikely(stale)) { stale_at = i; goto err_bucket_stale; } } bio = &op->wbio.bio; if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { bio = bio_split(bio, k.k->p.offset - op->pos.offset, GFP_KERNEL, &c->bio_write); wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { op->flags |= BCH_WRITE_submitted; } op->pos.offset += bio_sectors(bio); op->written += bio_sectors(bio); bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(&op->cl); bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); if (op->flags & BCH_WRITE_submitted) break; bch2_btree_iter_advance(trans, &iter); } out: bch2_trans_iter_exit(trans, &iter); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; bch2_trans_put(trans); darray_exit(&buckets); if (ret) { bch2_write_op_error(op, op->pos.offset, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); op->error = ret; op->flags |= BCH_WRITE_submitted; } /* fallback to cow write path? */ if (!(op->flags & BCH_WRITE_submitted)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; } else if (op->flags & BCH_WRITE_sync) { closure_sync(&op->cl); bch2_nocow_write_done(&op->cl.work); } else { /* * XXX * needs to run out of process context because ei_quota_lock is * a mutex */ continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); } return; err_get_ioref: darray_for_each(buckets, i) percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]); /* Fall back to COW path: */ goto out; err_bucket_stale: darray_for_each(buckets, i) { bch2_bucket_nocow_unlock(&c->nocow_locks, i->b, BUCKET_NOCOW_LOCK_UPDATE); if (i == stale_at) break; } struct printbuf buf = PRINTBUF; if (bch2_fs_inconsistent_on(stale < 0, c, "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = -BCH_ERR_data_write_invalid_ptr; } else { /* We can retry this: */ ret = -BCH_ERR_transaction_restart; } printbuf_exit(&buf); goto err_get_ioref; } static void __bch2_write(struct bch_write_op *op) { struct bch_fs *c = op->c; struct write_point *wp = NULL; struct bio *bio = NULL; unsigned nofs_flags; int ret; nofs_flags = memalloc_nofs_save(); if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); if (op->flags & BCH_WRITE_submitted) goto out_nofs_restore; } again: memset(&op->failed, 0, sizeof(op->failed)); do { struct bkey_i *key_to_write; unsigned key_to_write_offset = op->insert_keys.top_p - op->insert_keys.keys_p; /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets.v)) break; if (bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_EXTENT_U64s_MAX)) break; /* * The copygc thread is now global, which means it's no longer * freeing up space on specific disks, which means that * allocations for specific disks may hang arbitrarily long: */ ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), op->write_point, &op->devs_have, op->nr_replicas, op->nr_replicas_required, op->watermark, op->flags, &op->cl, &wp))); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; goto err; } EBUG_ON(!wp); bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { if (!(op->flags & BCH_WRITE_alloc_nowait)) bch2_write_op_error(op, op->pos.offset, "%s(): %s", __func__, bch2_err_str(ret)); op->error = ret; break; } } bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(bio->bi_private); key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, key_to_write, false); } while (ret); /* * Sync or no? * * If we're running asynchronously, wne may still want to block * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. */ if ((op->flags & BCH_WRITE_sync) || (!(op->flags & BCH_WRITE_submitted) && !(op->flags & BCH_WRITE_in_worker))) { bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); if (!(op->flags & BCH_WRITE_submitted)) goto again; bch2_write_done(&op->cl); } else { bch2_write_queue(op, wp); continue_at(&op->cl, bch2_write_index, NULL); } out_nofs_restore: memalloc_nofs_restore(nofs_flags); } static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) { struct bio *bio = &op->wbio.bio; struct bvec_iter iter; struct bkey_i_inline_data *id; unsigned sectors; int ret; memset(&op->failed, 0, sizeof(op->failed)); op->flags |= BCH_WRITE_wrote_data_inline; op->flags |= BCH_WRITE_submitted; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_U64s + DIV_ROUND_UP(data_len, 8)); if (ret) { op->error = ret; goto err; } sectors = bio_sectors(bio); op->pos.offset += sectors; id = bkey_inline_data_init(op->insert_keys.top); id->k.p = op->pos; id->k.bversion = op->version; id->k.size = sectors; iter = bio->bi_iter; iter.bi_size = data_len; memcpy_from_bio(id->v.data, bio, iter); while (data_len & 7) id->v.data[data_len++] = '\0'; set_bkey_val_bytes(&id->k, data_len); bch2_keylist_push(&op->insert_keys); __bch2_write_index(op); err: bch2_write_done(&op->cl); } /** * bch2_write() - handle a write to a cache device or flash only volume * @cl: &bch_write_op->cl * * This is the starting point for any data to end up in a cache device; it could * be from a normal write, or a writeback write, or a write to a flash only * volume - it's also used by the moving garbage collector to compact data in * mostly empty buckets. * * It first writes the data to the cache, creating a list of keys to be inserted * (if the data won't fit in a single open bucket, there will be multiple keys); * after the data is written it calls bch_journal, and after the keys have been * added to the next journal write they're inserted into the btree. * * If op->discard is true, instead of inserting the data it invalidates the * region of the cache represented by op->bio and op->inode. */ CLOSURE_CALLBACK(bch2_write) { closure_type(op, struct bch_write_op, cl); struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; unsigned data_len; EBUG_ON(op->cl.parent); BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); if (op->flags & BCH_WRITE_only_specified_devs) op->flags |= BCH_WRITE_alloc_nowait; op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { bch2_write_op_error(op, op->pos.offset, "misaligned write"); op->error = -BCH_ERR_data_write_misaligned; goto err; } if (c->opts.nochanges) { op->error = -BCH_ERR_erofs_no_writes; goto err; } if (!(op->flags & BCH_WRITE_move) && !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } if (!(op->flags & BCH_WRITE_move)) this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, op->new_i_size - (op->pos.offset << 9)); if (c->opts.inline_data && data_len <= min(block_bytes(c) / 2, 1024U)) { bch2_write_data_inline(op, data_len); return; } __bch2_write(op); return; err: bch2_disk_reservation_put(c, &op->res); closure_debug_destroy(&op->cl); if (op->end_io) op->end_io(op); } static const char * const bch2_write_flags[] = { #define x(f) #f, BCH_WRITE_FLAGS() #undef x NULL }; void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) { if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); printbuf_indent_add(out, 2); prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); prt_newline(out); prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); printbuf_indent_sub(out, 2); } void bch2_fs_io_write_exit(struct bch_fs *c) { mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } int bch2_fs_io_write_init(struct bch_fs *c) { if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) return -BCH_ERR_ENOMEM_bio_write_init; if (mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, c->opts.btree_node_size, c->opts.encoded_extent_max) / PAGE_SIZE, 0)) return -BCH_ERR_ENOMEM_bio_bounce_pages_init; return 0; }
36 159 1 159 159 1 64 64 3 161 97 47 1 166 167 29 165 34 132 33 129 163 1 162 162 32 130 48 43 5 48 163 163 22 22 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 // SPDX-License-Identifier: GPL-2.0 #include "messages.h" #include "ctree.h" #include "delalloc-space.h" #include "block-rsv.h" #include "btrfs_inode.h" #include "space-info.h" #include "qgroup.h" #include "fs.h" /* * HOW DOES THIS WORK * * There are two stages to data reservations, one for data and one for metadata * to handle the new extents and checksums generated by writing data. * * * DATA RESERVATION * The general flow of the data reservation is as follows * * -> Reserve * We call into btrfs_reserve_data_bytes() for the user request bytes that * they wish to write. We make this reservation and add it to * space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree * for the range and carry on if this is buffered, or follow up trying to * make a real allocation if we are pre-allocating or doing O_DIRECT. * * -> Use * At writepages()/prealloc/O_DIRECT time we will call into * btrfs_reserve_extent() for some part or all of this range of bytes. We * will make the allocation and subtract space_info->bytes_may_use by the * original requested length and increase the space_info->bytes_reserved by * the allocated length. This distinction is important because compression * may allocate a smaller on disk extent than we previously reserved. * * -> Allocation * finish_ordered_io() will insert the new file extent item for this range, * and then add a delayed ref update for the extent tree. Once that delayed * ref is written the extent size is subtracted from * space_info->bytes_reserved and added to space_info->bytes_used. * * Error handling * * -> By the reservation maker * This is the simplest case, we haven't completed our operation and we know * how much we reserved, we can simply call * btrfs_free_reserved_data_space*() and it will be removed from * space_info->bytes_may_use. * * -> After the reservation has been made, but before cow_file_range() * This is specifically for the delalloc case. You must clear * EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will * be subtracted from space_info->bytes_may_use. * * METADATA RESERVATION * The general metadata reservation lifetimes are discussed elsewhere, this * will just focus on how it is used for delalloc space. * * We keep track of two things on a per inode bases * * ->outstanding_extents * This is the number of file extent items we'll need to handle all of the * outstanding DELALLOC space we have in this inode. We limit the maximum * size of an extent, so a large contiguous dirty area may require more than * one outstanding_extent, which is why count_max_extents() is used to * determine how many outstanding_extents get added. * * ->csum_bytes * This is essentially how many dirty bytes we have for this inode, so we * can calculate the number of checksum items we would have to add in order * to checksum our outstanding data. * * We keep a per-inode block_rsv in order to make it easier to keep track of * our reservation. We use btrfs_calculate_inode_block_rsv_size() to * calculate the current theoretical maximum reservation we would need for the * metadata for this inode. We call this and then adjust our reservation as * necessary, either by attempting to reserve more space, or freeing up excess * space. * * OUTSTANDING_EXTENTS HANDLING * * ->outstanding_extents is used for keeping track of how many extents we will * need to use for this inode, and it will fluctuate depending on where you are * in the life cycle of the dirty data. Consider the following normal case for * a completely clean inode, with a num_bytes < our maximum allowed extent size * * -> reserve * ->outstanding_extents += 1 (current value is 1) * * -> set_delalloc * ->outstanding_extents += 1 (current value is 2) * * -> btrfs_delalloc_release_extents() * ->outstanding_extents -= 1 (current value is 1) * * We must call this once we are done, as we hold our reservation for the * duration of our operation, and then assume set_delalloc will update the * counter appropriately. * * -> add ordered extent * ->outstanding_extents += 1 (current value is 2) * * -> btrfs_clear_delalloc_extent * ->outstanding_extents -= 1 (current value is 1) * * -> finish_ordered_io/btrfs_remove_ordered_extent * ->outstanding_extents -= 1 (current value is 0) * * Each stage is responsible for their own accounting of the extent, thus * making error handling and cleanup easier. */ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; /* Make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, fs_info->sectorsize); if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; return btrfs_reserve_data_bytes(fs_info, bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len, bool noflush) { struct btrfs_fs_info *fs_info = inode->root->fs_info; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; int ret; /* align the range */ len = round_up(start + len, fs_info->sectorsize) - round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); if (noflush) flush = BTRFS_RESERVE_NO_FLUSH; else if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; ret = btrfs_reserve_data_bytes(fs_info, len, flush); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) { btrfs_free_reserved_data_space_noquota(fs_info, len); extent_changeset_free(*reserved); *reserved = NULL; } else { ret = 0; } return ret; } /* * Called if we need to clear a data reservation for this inode * Normally in a error case. * * This one will *NOT* use accurate qgroup reserved space API, just for case * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook(). */ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, u64 len) { struct btrfs_space_info *data_sinfo; ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; btrfs_space_info_free_bytes_may_use(data_sinfo, len); } /* * Called if we need to clear a data reservation for this inode * Normally in a error case. * * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; /* Make sure the range is aligned to sectorsize */ len = round_up(start + len, fs_info->sectorsize) - round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); btrfs_free_reserved_data_space_noquota(fs_info, len); btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } /* * Release any excessive reservations for an inode. * * @inode: the inode we need to release from * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup * meta reservation needs to know if we are freeing qgroup * reservation or just converting it into per-trans. Normally * @qgroup_free is true for error handling, and false for normal * release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. */ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 released = 0; u64 qgroup_to_release = 0; /* * Since we statically set the block_rsv->size we just want to say we * are releasing 0 bytes, and then we'll just get the reservation over * the size free'd. */ released = btrfs_block_rsv_release(fs_info, block_rsv, 0, &qgroup_to_release); if (released > 0) trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), released, 0); if (qgroup_free) btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); else btrfs_qgroup_convert_reserved_meta(inode->root, qgroup_to_release); } static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 reserve_size = 0; u64 qgroup_rsv_size = 0; unsigned outstanding_extents; lockdep_assert_held(&inode->lock); outstanding_extents = inode->outstanding_extents; /* * Insert size for the number of outstanding extents, 1 normal size for * updating the inode. */ if (outstanding_extents) { reserve_size = btrfs_calc_insert_metadata_size(fs_info, outstanding_extents); reserve_size += btrfs_calc_metadata_size(fs_info, 1); } if (!(inode->flags & BTRFS_INODE_NODATASUM)) { u64 csum_leaves; csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves); } /* * For qgroup rsv, the calculation is very simple: * account one nodesize for each outstanding extent * * This is overestimating in most cases. */ qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; block_rsv->qgroup_rsv_size = qgroup_rsv_size; spin_unlock(&block_rsv->lock); } static void calc_inode_reservations(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 nr_extents = count_max_extents(fs_info, num_bytes); u64 csum_leaves; u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); if (inode->flags & BTRFS_INODE_NODATASUM) csum_leaves = 0; else csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, nr_extents + csum_leaves); /* * finish_ordered_io has to update the inode, so add the space required * for an inode update. */ *meta_reserve += inode_update; *qgroup_reserve = nr_extents * fs_info->nodesize; } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv = &inode->block_rsv; u64 meta_reserve, qgroup_reserve; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; /* * If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc * mutex since we won't race with anybody. We need this mostly to make * lockdep shut its filthy mouth. * * If we have a transaction open (can happen if we call truncate_block * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ if (noflush || btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; } else { if (current->journal_info) flush = BTRFS_RESERVE_FLUSH_LIMIT; } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); /* * We always want to do it this way, every other way is wrong and ends * in tears. Pre-reserving the amount we are going to add will always * be the right way, because otherwise if we have enough parallelism we * could end up with thousands of inodes all holding little bits of * reservations they were able to make previously and the only way to * reclaim that space is to ENOSPC out the operations and clear * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ calc_inode_reservations(inode, num_bytes, disk_num_bytes, &meta_reserve, &qgroup_reserve); ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, noflush); if (ret) return ret; ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; } /* * Now we need to update our outstanding extents and csum bytes _first_ * and then add the reservation to the block_rsv. This keeps us from * racing with an ordered completion or some such that would think it * needs to free the reservation we just made. */ nr_extents = count_max_extents(fs_info, num_bytes); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, nr_extents); if (!(inode->flags & BTRFS_INODE_NODATASUM)) inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); /* Now we can safely add our space to our block rsv */ btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), meta_reserve, 1); spin_lock(&block_rsv->lock); block_rsv->qgroup_rsv_reserved += qgroup_reserve; spin_unlock(&block_rsv->lock); return 0; } /* * Release a metadata reservation for an inode. * * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata * reservations, or on error for the same reason. */ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free) { struct btrfs_fs_info *fs_info = inode->root->fs_info; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); if (!(inode->flags & BTRFS_INODE_NODATASUM)) inode->csum_bytes -= num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); if (btrfs_is_testing(fs_info)) return; btrfs_inode_rsv_release(inode, qgroup_free); } /* * Release our outstanding_extents for an inode. * * @inode: the inode to balance the reservation for. * @num_bytes: the number of bytes we originally reserved with * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we * have outstanding_extents to track the real usage, so we use this to free our * temporarily tracked outstanding_extents. This _must_ be used in conjunction * with btrfs_delalloc_reserve_metadata. */ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned num_extents; spin_lock(&inode->lock); num_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, -num_extents); btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); if (btrfs_is_testing(fs_info)) return; btrfs_inode_rsv_release(inode, true); } /* * Reserve data and metadata space for delalloc * * @inode: inode we're writing to * @start: start range we are writing to * @len: how long the range we are writing to * @reserved: mandatory parameter, record actually reserved qgroup ranges of * current reservation. * * This will do the following things * * - reserve space in data space info for num bytes and reserve precious * corresponding qgroup space * (Done in check_data_free_space) * * - reserve space for metadata space, based on the number of outstanding * extents and how much csums will be needed also reserve metadata space in a * per root over-reserve method. * - add to the inodes->delalloc_bytes * - add it to the fs_info's delalloc inodes list. * (Above 3 all done in delalloc_reserve_metadata) * * Return 0 for success * Return <0 for error(-ENOSPC or -EDQUOT) */ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) { int ret; ret = btrfs_check_data_free_space(inode, reserved, start, len, false); if (ret < 0) return ret; ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); *reserved = NULL; } return ret; } /* * Release data and metadata space for delalloc * * @inode: inode we're releasing space for * @reserved: list of changed/reserved ranges * @start: start position of the space already reserved * @len: length of the space already reserved * @qgroup_free: should qgroup reserved-space also be freed * * Release the metadata space that was not used and will decrement * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if * there are no delalloc bytes left. Also it will handle the qgroup reserved * space. */ void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free) { btrfs_delalloc_release_metadata(inode, len, qgroup_free); btrfs_free_reserved_data_space(inode, reserved, start, len); }
276 73 73 73 73 250 251 93 6 6 6 93 93 6 6 13 13 6 1 5 6 6 6 6 6 6 13 13 93 92 9 93 92 93 93 93 93 93 93 87 88 88 88 88 69 70 69 13 6 93 93 93 93 93 73 73 115 115 276 131 274 283 211 267 267 265 276 35 251 251 93 86 251 53 269 267 154 130 72 3 1 162 162 162 1 1 1 265 268 142 142 119 119 119 210 211 205 209 323 314 9 9 9 9 226 212 35 35 9 29 226 322 323 323 5 317 317 316 317 317 317 35 321 35 322 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_locking.h" #include "btree_types.h" static struct lock_class_key bch2_btree_node_lock_key; void bch2_btree_lock_init(struct btree_bkey_cached_common *b, enum six_lock_init_flags flags, gfp_t gfp) { __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); lockdep_set_notrack_class(&b->lock); } /* Btree node locking: */ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, struct btree_path *skip, struct btree_bkey_cached_common *b, unsigned level) { struct btree_path *path; struct six_lock_count ret; unsigned i; memset(&ret, 0, sizeof(ret)); if (IS_ERR_OR_NULL(b)) return ret; trans_for_each_path(trans, path, i) if (path != skip && &path->l[level].b->c == b) { int t = btree_node_locked_type(path, level); if (t != BTREE_NODE_UNLOCKED) ret.n[t]++; } return ret; } /* unlock */ void bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree_path *path, struct btree *b) { bch2_btree_node_unlock_write_inlined(trans, path, b); } /* lock */ /* * @trans wants to lock @b with type @type */ struct trans_waiting_for_lock { struct btree_trans *trans; struct btree_bkey_cached_common *node_want; enum six_lock_type lock_want; /* for iterating over held locks :*/ u8 path_idx; u8 level; u64 lock_start_time; }; struct lock_graph { struct trans_waiting_for_lock g[8]; unsigned nr; }; static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) { struct trans_waiting_for_lock *i; prt_printf(out, "Found lock cycle (%u entries):\n", g->nr); for (i = g->g; i < g->g + g->nr; i++) { struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (!task) continue; bch2_btree_trans_to_text(out, i->trans); bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT); } } static noinline void print_chain(struct printbuf *out, struct lock_graph *g) { struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } static void lock_graph_up(struct lock_graph *g) { closure_put(&g->g[--g->nr].trans->ref); } static noinline void lock_graph_pop_all(struct lock_graph *g) { while (g->nr) lock_graph_up(g); } static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) { while (g->g + g->nr > i) lock_graph_up(g); } static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) { g->g[g->nr++] = (struct trans_waiting_for_lock) { .trans = trans, .node_want = trans->locking, .lock_want = trans->locking_wait.lock_want, }; } static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) { closure_get(&trans->ref); __lock_graph_down(g, trans); } static bool lock_graph_remove_non_waiters(struct lock_graph *g, struct trans_waiting_for_lock *from) { struct trans_waiting_for_lock *i; if (from->trans->locking != from->node_want) { lock_graph_pop_from(g, from); return true; } for (i = from + 1; i < g->g + g->nr; i++) if (i->trans->locking != i->node_want || i->trans->locking_wait.start_time != i[-1].lock_start_time) { lock_graph_pop_from(g, i); return true; } return false; } static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans) { struct bch_fs *c = trans->c; count_event(c, trans_restart_would_deadlock); if (trace_trans_restart_would_deadlock_enabled()) { struct printbuf buf = PRINTBUF; buf.atomic++; print_cycle(&buf, g); trace_trans_restart_would_deadlock(trans, buf.buf); printbuf_exit(&buf); } } static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); return btree_trans_restart_foreign_task(i->trans, BCH_ERR_transaction_restart_would_deadlock, _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); return 0; } } static int btree_trans_abort_preference(struct btree_trans *trans) { if (trans->lock_may_not_fail) return 0; if (trans->locking_wait.lock_want == SIX_LOCK_write) return 1; if (!trans->in_traverse_all) return 2; return 3; } static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, struct trans_waiting_for_lock *from) { struct trans_waiting_for_lock *i, *abort = NULL; unsigned best = 0, pref; int ret; if (lock_graph_remove_non_waiters(g, from)) return 0; /* Only checking, for debugfs: */ if (cycle) { print_cycle(cycle, g); ret = -1; goto out; } for (i = from; i < g->g + g->nr; i++) { pref = btree_trans_abort_preference(i->trans); if (pref > best) { abort = i; best = pref; } } if (unlikely(!best)) { struct printbuf buf = PRINTBUF; buf.atomic++; prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); for (i = g->g; i < g->g + g->nr; i++) { struct btree_trans *trans = i->trans; bch2_btree_trans_to_text(&buf, trans); prt_printf(&buf, "backtrace:\n"); printbuf_indent_add(&buf, 2); bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } ret = abort_lock(g, abort); out: if (ret) lock_graph_pop_all(g); else lock_graph_pop_from(g, abort); return ret; } static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct printbuf *cycle) { struct btree_trans *orig_trans = g->g->trans; struct trans_waiting_for_lock *i; for (i = g->g; i < g->g + g->nr; i++) if (i->trans == trans) { closure_put(&trans->ref); return break_cycle(g, cycle, i); } if (g->nr == ARRAY_SIZE(g->g)) { closure_put(&trans->ref); if (orig_trans->lock_may_not_fail) return 0; lock_graph_pop_all(g); if (cycle) return 0; trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); } __lock_graph_down(g, trans); return 0; } static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) { return t1 + t2 > 1; } int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) { struct lock_graph g; struct trans_waiting_for_lock *top; struct btree_bkey_cached_common *b; btree_path_idx_t path_idx; int ret = 0; g.nr = 0; if (trans->lock_must_abort && !trans->lock_may_not_fail) { if (cycle) return -1; trace_would_deadlock(&g, trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } lock_graph_down(&g, trans); /* trans->paths is rcu protected vs. freeing */ rcu_read_lock(); if (cycle) cycle->atomic++; next: if (!g.nr) goto out; top = &g.g[g.nr - 1]; struct btree_path *paths = rcu_dereference(top->trans->paths); if (!paths) goto up; unsigned long *paths_allocated = trans_paths_allocated(paths); trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), path_idx, top->path_idx) { struct btree_path *path = paths + path_idx; if (!path->nodes_locked) continue; if (path_idx != top->path_idx) { top->path_idx = path_idx; top->level = 0; top->lock_start_time = 0; } for (; top->level < BTREE_MAX_DEPTH; top->level++, top->lock_start_time = 0) { int lock_held = btree_node_locked_type(path, top->level); if (lock_held == BTREE_NODE_UNLOCKED) continue; b = &READ_ONCE(path->l[top->level].b)->c; if (IS_ERR_OR_NULL(b)) { /* * If we get here, it means we raced with the * other thread updating its btree_path * structures - which means it can't be blocked * waiting on a lock: */ if (!lock_graph_remove_non_waiters(&g, g.g)) { /* * If lock_graph_remove_non_waiters() * didn't do anything, it must be * because we're being called by debugfs * checking for lock cycles, which * invokes us on btree_transactions that * aren't actually waiting on anything. * Just bail out: */ lock_graph_pop_all(&g); } goto next; } if (list_empty_careful(&b->lock.wait_list)) continue; raw_spin_lock(&b->lock.wait_lock); list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { BUG_ON(b != trans->locking); if (top->lock_start_time && time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) continue; top->lock_start_time = trans->locking_wait.start_time; /* Don't check for self deadlock: */ if (trans == top->trans || !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) continue; closure_get(&trans->ref); raw_spin_unlock(&b->lock.wait_lock); ret = lock_graph_descend(&g, trans, cycle); if (ret) goto out; goto next; } raw_spin_unlock(&b->lock.wait_lock); } } up: if (g.nr > 1 && cycle) print_chain(cycle, &g); lock_graph_up(&g); goto next; out: if (cycle) --cycle->atomic; rcu_read_unlock(); return ret; } int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) { struct btree_trans *trans = p; return bch2_check_for_deadlock(trans, NULL); } int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b, bool lock_may_not_fail) { int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; int ret; /* * Must drop our read locks before calling six_lock_write() - * six_unlock() won't do wakeups until the reader count * goes to 0, and it's safe because we have the node intent * locked: */ six_lock_readers_add(&b->lock, -readers); ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail, _RET_IP_); six_lock_readers_add(&b->lock, readers); if (ret) mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); return ret; } void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_path *path, struct btree_bkey_cached_common *b) { int ret = __btree_node_lock_write(trans, path, b, true); BUG_ON(ret); } /* relock */ static inline bool btree_path_get_locks(struct btree_trans *trans, struct btree_path *path, bool upgrade, struct get_locks_fail *f) { unsigned l = path->level; int fail_idx = -1; do { if (!btree_path_node(path, l)) break; if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) : bch2_btree_node_relock(trans, path, l))) { fail_idx = l; if (f) { f->l = l; f->b = path->l[l].b; } } l++; } while (l < path->locks_want); /* * When we fail to get a lock, we have to ensure that any child nodes * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ if (fail_idx >= 0) { __bch2_btree_path_unlock(trans, path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); do { path->l[fail_idx].b = upgrade ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) : ERR_PTR(-BCH_ERR_no_btree_node_relock); --fail_idx; } while (fail_idx >= 0); } if (path->uptodate == BTREE_ITER_NEED_RELOCK) path->uptodate = BTREE_ITER_UPTODATE; return path->uptodate < BTREE_ITER_NEED_RELOCK; } bool __bch2_btree_node_relock(struct btree_trans *trans, struct btree_path *path, unsigned level, bool trace) { struct btree *b = btree_path_node(path, level); int want = __btree_lock_want(path, level); if (race_fault()) goto fail; if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, &b->c, level, want))) { mark_btree_node_locked(trans, path, level, want); return true; } fail: if (trace && !trans->notrace_relock_fail) trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); return false; } /* upgrade */ bool bch2_btree_node_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree *b = path->l[level].b; if (!is_btree_node(path, level)) return false; switch (btree_lock_want(path, level)) { case BTREE_NODE_UNLOCKED: BUG_ON(btree_node_locked(path, level)); return true; case BTREE_NODE_READ_LOCKED: BUG_ON(btree_node_intent_locked(path, level)); return bch2_btree_node_relock(trans, path, level); case BTREE_NODE_INTENT_LOCKED: break; case BTREE_NODE_WRITE_LOCKED: BUG(); } if (btree_node_intent_locked(path, level)) return true; if (race_fault()) return false; if (btree_node_locked(path, level) ? six_lock_tryupgrade(&b->c.lock) : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) goto success; if (btree_node_lock_seq_matches(path, b, level) && btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { btree_node_unlock(trans, path, level); goto success; } trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); return false; success: mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); return true; } /* Btree path locking: */ /* * Only for btree_cache.c - only relocks intent locks */ int bch2_btree_path_relock_intent(struct btree_trans *trans, struct btree_path *path) { unsigned l; for (l = path->level; l < path->locks_want && btree_path_node(path, l); l++) { if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(trans, path); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } } return 0; } __flatten bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { struct get_locks_fail f; bool ret = btree_path_get_locks(trans, path, false, &f); bch2_trans_verify_locks(trans); return ret; } int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { if (!bch2_btree_path_relock_norestart(trans, path)) { trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); } return 0; } bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want, struct get_locks_fail *f) { EBUG_ON(path->locks_want >= new_locks_want); path->locks_want = new_locks_want; bool ret = btree_path_get_locks(trans, path, true, f); bch2_trans_verify_locks(trans); return ret; } bool __bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want, struct get_locks_fail *f) { bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); if (ret) goto out; /* * XXX: this is ugly - we'd prefer to not be mucking with other * iterators in the btree_trans here. * * On failure to upgrade the iterator, setting iter->locks_want and * calling get_locks() is sufficient to make bch2_btree_path_traverse() * get the locks we want on transaction restart. * * But if this iterator was a clone, on transaction restart what we did * to this iterator isn't going to be preserved. * * Possibly we could add an iterator field for the parent iterator when * an iterator is a copy - for now, we'll just upgrade any other * iterators with the same btree id. * * The code below used to be needed to ensure ancestor nodes get locked * before interior nodes - now that's handled by * bch2_btree_path_traverse_all(). */ if (!path->cached && !trans->in_traverse_all) { struct btree_path *linked; unsigned i; trans_for_each_path(trans, linked, i) if (linked != path && linked->cached == path->cached && linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; btree_path_get_locks(trans, linked, true, NULL); } } out: bch2_trans_verify_locks(trans); return ret; } void __bch2_btree_path_downgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { unsigned l, old_locks_want = path->locks_want; if (trans->restarted) return; EBUG_ON(path->locks_want < new_locks_want); path->locks_want = new_locks_want; while (path->nodes_locked && (l = btree_path_highest_level_locked(path)) >= path->locks_want) { if (l > path->level) { btree_node_unlock(trans, path, l); } else { if (btree_node_intent_locked(path, l)) { six_lock_downgrade(&path->l[l].b->c.lock); mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); } break; } } bch2_btree_path_verify_locks(path); trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } /* Btree transaction locking: */ void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; unsigned i; if (trans->restarted) return; trans_for_each_path(trans, path, i) if (path->ref) bch2_btree_path_downgrade(trans, path); } static inline void __bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, struct get_locks_fail *f, bool trace) { if (!trace) goto out; if (trace_trans_restart_relock_enabled()) { struct printbuf buf = PRINTBUF; bch2_bpos_to_text(&buf, path->pos); prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); if (IS_ERR_OR_NULL(f->b)) { prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); } else { prt_printf(&buf, "%u", f->b->c.lock.seq); struct six_lock_count c = bch2_btree_node_lock_counts(trans, NULL, &f->b->c, f->l); prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); c = six_lock_counts(&f->b->c.lock); prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]); } trace_trans_restart_relock(trans, _RET_IP_, buf.buf); printbuf_exit(&buf); } count_event(trans->c, trans_restart_relock); out: __bch2_trans_unlock(trans); bch2_trans_verify_locks(trans); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) { bch2_trans_verify_locks(trans); if (unlikely(trans->restarted)) return -((int) trans->restarted); if (unlikely(trans->locked)) goto out; struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) { struct get_locks_fail f; if (path->should_be_locked && !btree_path_get_locks(trans, path, false, &f)) return bch2_trans_relock_fail(trans, path, &f, trace); } trans_set_locked(trans, true); out: bch2_trans_verify_locks(trans); return 0; } int bch2_trans_relock(struct btree_trans *trans) { return __bch2_trans_relock(trans, true); } int bch2_trans_relock_notrace(struct btree_trans *trans) { return __bch2_trans_relock(trans, false); } void bch2_trans_unlock_noassert(struct btree_trans *trans) { __bch2_trans_unlock(trans); trans_set_unlocked(trans); } void bch2_trans_unlock(struct btree_trans *trans) { __bch2_trans_unlock(trans); trans_set_unlocked(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) { bch2_trans_unlock(trans); bch2_trans_srcu_unlock(trans); } void bch2_trans_unlock_write(struct btree_trans *trans) { struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) if (btree_node_write_locked(path, l)) bch2_btree_node_unlock_write(trans, path, path->l[l].b); } int __bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) { int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); if (ret) mutex_unlock(lock); return ret; } /* Debug */ #ifdef CONFIG_BCACHEFS_DEBUG void bch2_btree_path_verify_locks(struct btree_path *path) { /* * A path may be uptodate and yet have nothing locked if and only if * there is no node at path->level, which generally means we were * iterating over all nodes and got to the end of the btree */ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && btree_path_node(path, path->level) && !path->nodes_locked); if (!path->nodes_locked) return; for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); int have = btree_node_locked_type(path, l); BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); BUG_ON(is_btree_node(path, l) && (want == BTREE_NODE_UNLOCKED || have != BTREE_NODE_WRITE_LOCKED) && want != have); BUG_ON(btree_node_locked(path, l) && path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); } } static bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) if (path->nodes_locked) return true; return false; } void bch2_trans_verify_locks(struct btree_trans *trans) { if (!trans->locked) { BUG_ON(bch2_trans_locked(trans)); return; } struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) bch2_btree_path_verify_locks(path); } #endif
4665 4665 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 // SPDX-License-Identifier: GPL-2.0 /* * Detect hard and soft lockups on a system * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * * Note: Most of this code is borrowed heavily from the original softlockup * detector, so thanks to Ingo for the initial implementation. * Some chunks also taken from the old x86-specific nmi watchdog code, thanks * to those contributors as well. */ #define pr_fmt(fmt) "watchdog: " fmt #include <linux/cpu.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <linux/kernel_stat.h> #include <linux/kvm_para.h> #include <linux/math64.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> #include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/tick.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <asm/irq_regs.h> static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HARDLOCKUP_DETECTOR_SPARC64) # define WATCHDOG_HARDLOCKUP_DEFAULT 1 #else # define WATCHDOG_HARDLOCKUP_DEFAULT 0 #endif #define NUM_SAMPLE_PERIODS 5 unsigned long __read_mostly watchdog_enabled; int __read_mostly watchdog_user_enabled = 1; static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT; static int __read_mostly watchdog_softlockup_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly watchdog_hardlockup_available; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR # ifdef CONFIG_SMP int __read_mostly sysctl_hardlockup_all_cpu_backtrace; # endif /* CONFIG_SMP */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ unsigned int __read_mostly hardlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); /* * We may not want to enable hard lockup detection by default in all cases, * for example when running the kernel as a guest on a hypervisor. In these * cases this function can be called to disable hard lockup detection. This * function should only be executed once by the boot processor before the * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ void __init hardlockup_detector_disable(void) { watchdog_hardlockup_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) { next: if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; else if (!strncmp(str, "r", 1)) hardlockup_config_perf_event(str + 1); while (*(str++)) { if (*str == ',') { str++; goto next; } } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); #endif /* CONFIG_HARDLOCKUP_DETECTOR */ #if defined(CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER) static DEFINE_PER_CPU(atomic_t, hrtimer_interrupts); static DEFINE_PER_CPU(int, hrtimer_interrupts_saved); static DEFINE_PER_CPU(bool, watchdog_hardlockup_warned); static DEFINE_PER_CPU(bool, watchdog_hardlockup_touched); static unsigned long hard_lockup_nmi_warn; notrace void arch_touch_nmi_watchdog(void) { /* * Using __raw here because some code paths have * preemption enabled. If preemption is enabled * then interrupts should be enabled too, in which * case we shouldn't have to worry about the watchdog * going off. */ raw_cpu_write(watchdog_hardlockup_touched, true); } EXPORT_SYMBOL(arch_touch_nmi_watchdog); void watchdog_hardlockup_touch_cpu(unsigned int cpu) { per_cpu(watchdog_hardlockup_touched, cpu) = true; } static bool is_hardlockup(unsigned int cpu) { int hrint = atomic_read(&per_cpu(hrtimer_interrupts, cpu)); if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) return true; /* * NOTE: we don't need any fancy atomic_t or READ_ONCE/WRITE_ONCE * for hrtimer_interrupts_saved. hrtimer_interrupts_saved is * written/read by a single CPU. */ per_cpu(hrtimer_interrupts_saved, cpu) = hrint; return false; } static void watchdog_hardlockup_kick(void) { int new_interrupts; new_interrupts = atomic_inc_return(this_cpu_ptr(&hrtimer_interrupts)); watchdog_buddy_check_hardlockup(new_interrupts); } void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) { if (per_cpu(watchdog_hardlockup_touched, cpu)) { per_cpu(watchdog_hardlockup_touched, cpu) = false; return; } /* * Check for a hardlockup by making sure the CPU's timer * interrupt is incrementing. The timer interrupt should have * fired multiple times before we overflow'd. If it hasn't * then this is a good indication the cpu is stuck */ if (is_hardlockup(cpu)) { unsigned int this_cpu = smp_processor_id(); unsigned long flags; /* Only print hardlockups once. */ if (per_cpu(watchdog_hardlockup_warned, cpu)) return; /* * Prevent multiple hard-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ if (sysctl_hardlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &hard_lockup_nmi_warn)) return; } /* * NOTE: we call printk_cpu_sync_get_irqsave() after printing * the lockup message. While it would be nice to serialize * that printout, we really want to make sure that if some * other CPU somehow locked up while holding the lock associated * with printk_cpu_sync_get_irqsave() that we can still at least * get the message about the lockup out. */ pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu); printk_cpu_sync_get_irqsave(flags); print_modules(); print_irqtrace_events(current); if (cpu == this_cpu) { if (regs) show_regs(regs); else dump_stack(); printk_cpu_sync_put_irqrestore(flags); } else { printk_cpu_sync_put_irqrestore(flags); trigger_single_cpu_backtrace(cpu); } if (sysctl_hardlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(cpu); if (!hardlockup_panic) clear_bit_unlock(0, &hard_lockup_nmi_warn); } if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); per_cpu(watchdog_hardlockup_warned, cpu) = true; } else { per_cpu(watchdog_hardlockup_warned, cpu) = false; } } #else /* CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ static inline void watchdog_hardlockup_kick(void) { } #endif /* !CONFIG_HARDLOCKUP_DETECTOR_COUNTS_HRTIMER */ /* * These functions can be overridden based on the configured hardlockdup detector. * * watchdog_hardlockup_enable/disable can be implemented to start and stop when * softlockup watchdog start and stop. The detector must select the * SOFTLOCKUP_DETECTOR Kconfig. */ void __weak watchdog_hardlockup_enable(unsigned int cpu) { } void __weak watchdog_hardlockup_disable(unsigned int cpu) { } /* * Watchdog-detector specific API. * * Return 0 when hardlockup watchdog is available, negative value otherwise. * Note that the negative value means that a delayed probe might * succeed later. */ int __weak __init watchdog_hardlockup_probe(void) { return -ENODEV; } /** * watchdog_hardlockup_stop - Stop the watchdog for reconfiguration * * The reconfiguration steps are: * watchdog_hardlockup_stop(); * update_variables(); * watchdog_hardlockup_start(); */ void __weak watchdog_hardlockup_stop(void) { } /** * watchdog_hardlockup_start - Start the watchdog after reconfiguration * * Counterpart to watchdog_hardlockup_stop(). * * The following variables have been updated in update_variables() and * contain the currently valid configuration: * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask */ void __weak watchdog_hardlockup_start(void) { } /** * lockup_detector_update_enable - Update the sysctl enable bit * * Caller needs to make sure that the hard watchdogs are off, so this * can't race with watchdog_hardlockup_disable(). */ static void lockup_detector_update_enable(void) { watchdog_enabled = 0; if (!watchdog_user_enabled) return; if (watchdog_hardlockup_available && watchdog_hardlockup_user_enabled) watchdog_enabled |= WATCHDOG_HARDLOCKUP_ENABLED; if (watchdog_softlockup_user_enabled) watchdog_enabled |= WATCHDOG_SOFTOCKUP_ENABLED; } #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * Delay the soflockup report when running a known slow code. * It does _not_ affect the timestamp of the last successdul reschedule. */ #define SOFTLOCKUP_DELAY_REPORT ULONG_MAX #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; /* Timestamp taken after the last successful reschedule. */ static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); /* Timestamp of the last softlockup report. */ static DEFINE_PER_CPU(unsigned long, watchdog_report_ts); static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static unsigned long soft_lockup_nmi_warn; static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); return 1; } __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { watchdog_softlockup_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); static int __init watchdog_thresh_setup(char *str) { get_option(&str, &watchdog_thresh); return 1; } __setup("watchdog_thresh=", watchdog_thresh_setup); #ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM enum stats_per_group { STATS_SYSTEM, STATS_SOFTIRQ, STATS_HARDIRQ, STATS_IDLE, NUM_STATS_PER_GROUP, }; static const enum cpu_usage_stat tracked_stats[NUM_STATS_PER_GROUP] = { CPUTIME_SYSTEM, CPUTIME_SOFTIRQ, CPUTIME_IRQ, CPUTIME_IDLE, }; static DEFINE_PER_CPU(u16, cpustat_old[NUM_STATS_PER_GROUP]); static DEFINE_PER_CPU(u8, cpustat_util[NUM_SAMPLE_PERIODS][NUM_STATS_PER_GROUP]); static DEFINE_PER_CPU(u8, cpustat_tail); /* * We don't need nanosecond resolution. A granularity of 16ms is * sufficient for our precision, allowing us to use u16 to store * cpustats, which will roll over roughly every ~1000 seconds. * 2^24 ~= 16 * 10^6 */ static u16 get_16bit_precision(u64 data_ns) { return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ } static void update_cpustat(void) { int i; u8 util; u16 old_stat, new_stat; struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u8 tail = __this_cpu_read(cpustat_tail); u16 sample_period_16 = get_16bit_precision(sample_period); kcpustat_cpu_fetch(&kcpustat, smp_processor_id()); for (i = 0; i < NUM_STATS_PER_GROUP; i++) { old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } __this_cpu_write(cpustat_tail, (tail + 1) % NUM_SAMPLE_PERIODS); } static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); u64 sample_period_second = sample_period; do_div(sample_period_second, NSEC_PER_SEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", smp_processor_id(), sample_period_second); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; printk(KERN_CRIT "\t#%d: %3u%% system,\t%3u%% softirq,\t" "%3u%% hardirq,\t%3u%% idle\n", i + 1, __this_cpu_read(cpustat_util[group][STATS_SYSTEM]), __this_cpu_read(cpustat_util[group][STATS_SOFTIRQ]), __this_cpu_read(cpustat_util[group][STATS_HARDIRQ]), __this_cpu_read(cpustat_util[group][STATS_IDLE])); } } #define HARDIRQ_PERCENT_THRESH 50 #define NUM_HARDIRQ_REPORT 5 struct irq_counts { int irq; u32 counts; }; static DEFINE_PER_CPU(bool, snapshot_taken); /* Tabulate the most frequent interrupts. */ static void tabulate_irq_count(struct irq_counts *irq_counts, int irq, u32 counts, int rank) { int i; struct irq_counts new_count = {irq, counts}; for (i = 0; i < rank; i++) { if (counts > irq_counts[i].counts) swap(new_count, irq_counts[i]); } } /* * If the hardirq time exceeds HARDIRQ_PERCENT_THRESH% of the sample_period, * then the cause of softlockup might be interrupt storm. In this case, it * would be useful to start interrupt counting. */ static bool need_counting_irqs(void) { u8 util; int tail = __this_cpu_read(cpustat_tail); tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT; util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]); return util > HARDIRQ_PERCENT_THRESH; } static void start_counting_irqs(void) { if (!__this_cpu_read(snapshot_taken)) { kstat_snapshot_irqs(); __this_cpu_write(snapshot_taken, true); } } static void stop_counting_irqs(void) { __this_cpu_write(snapshot_taken, false); } static void print_irq_counts(void) { unsigned int i, count; struct irq_counts irq_counts_sorted[NUM_HARDIRQ_REPORT] = { {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0}, {-1, 0} }; if (__this_cpu_read(snapshot_taken)) { for_each_active_irq(i) { count = kstat_get_irq_since_snapshot(i); tabulate_irq_count(irq_counts_sorted, i, count, NUM_HARDIRQ_REPORT); } /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). */ printk(KERN_CRIT "CPU#%d Detect HardIRQ Time exceeds %d%%. Most frequent HardIRQs:\n", smp_processor_id(), HARDIRQ_PERCENT_THRESH); for (i = 0; i < NUM_HARDIRQ_REPORT; i++) { if (irq_counts_sorted[i].irq == -1) break; printk(KERN_CRIT "\t#%u: %-10u\tirq#%d\n", i + 1, irq_counts_sorted[i].counts, irq_counts_sorted[i].irq); } /* * If the hardirq time is less than HARDIRQ_PERCENT_THRESH% in the last * sample_period, then we suspect the interrupt storm might be subsiding. */ if (!need_counting_irqs()) stop_counting_irqs(); } } static void report_cpu_status(void) { print_cpustat(); print_irq_counts(); } #else static inline void update_cpustat(void) { } static inline void report_cpu_status(void) { } static inline bool need_counting_irqs(void) { return false; } static inline void start_counting_irqs(void) { } static inline void stop_counting_irqs(void) { } #endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally * want a higher threshold for soft lockups than for hard lockups. So we couple * the thresholds with a factor: we make the soft threshold twice the amount of * time the hard threshold is. */ static int get_softlockup_thresh(void) { return watchdog_thresh * 2; } /* * Returns seconds, approximately. We don't need nanosecond * resolution, and we don't need to waste time with a big divide when * 2^30ns == 1.074s. */ static unsigned long get_timestamp(void) { return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) { /* * convert watchdog_thresh from seconds to ns * the divide by 5 is to give hrtimer several chances (two * or three with the current relation between the soft * and hard thresholds) to increment before the * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / NUM_SAMPLE_PERIODS); watchdog_update_hrtimer_threshold(sample_period); } static void update_report_ts(void) { __this_cpu_write(watchdog_report_ts, get_timestamp()); } /* Commands for resetting the watchdog */ static void update_touch_ts(void) { __this_cpu_write(watchdog_touch_ts, get_timestamp()); update_report_ts(); } /** * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls * * Call when the scheduler may have stalled for legitimate reasons * preventing the watchdog task from executing - e.g. the scheduler * entering idle state. This should only be used for scheduler events. * Use touch_softlockup_watchdog() for everything else. */ notrace void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's watchdog * report period gets restarted here, so use the raw_ operation. */ raw_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } notrace void touch_softlockup_watchdog(void) { touch_softlockup_watchdog_sched(); wq_watchdog_touch(raw_smp_processor_id()); } EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) { int cpu; /* * watchdog_mutex cannpt be taken here, as this might be called * from (soft)interrupt context, so the access to * watchdog_allowed_cpumask might race with a concurrent update. * * The watchdog time stamp can race against a concurrent real * update as well, the only side effect might be a cycle delay for * the softlockup check. */ for_each_cpu(cpu, &watchdog_allowed_mask) { per_cpu(watchdog_report_ts, cpu) = SOFTLOCKUP_DELAY_REPORT; wq_watchdog_touch(cpu); } } void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } static int is_softlockup(unsigned long touch_ts, unsigned long period_ts, unsigned long now) { if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) { /* * If period_ts has not been updated during a sample_period, then * in the subsequent few sample_periods, period_ts might also not * be updated, which could indicate a potential softlockup. In * this case, if we suspect the cause of the potential softlockup * might be interrupt storm, then we need to count the interrupts * to find which interrupt is storming. */ if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) && need_counting_irqs()) start_counting_irqs(); /* * A poorly behaving BPF scheduler can live-lock the system into * soft lockups. Tell sched_ext to try ejecting the BPF * scheduler when close to a soft lockup. */ if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) scx_softlockup(now - touch_ts); /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; } return 0; } /* watchdog detector functions */ static DEFINE_PER_CPU(struct completion, softlockup_completion); static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); /* * The watchdog feed function - touches the timestamp. * * It only runs once every sample_period seconds (4 seconds by * default) to reset the softlockup timestamp. If this gets delayed * for more than 2*watchdog_thresh seconds then the debug-printout * triggers in watchdog_timer_fn(). */ static int softlockup_fn(void *data) { update_touch_ts(); stop_counting_irqs(); complete(this_cpu_ptr(&softlockup_completion)); return 0; } /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; unsigned long flags; if (!watchdog_enabled) return HRTIMER_NORESTART; watchdog_hardlockup_kick(); /* kick the softlockup detector */ if (completion_done(this_cpu_ptr(&softlockup_completion))) { reinit_completion(this_cpu_ptr(&softlockup_completion)); stop_one_cpu_nowait(smp_processor_id(), softlockup_fn, NULL, this_cpu_ptr(&softlockup_stop_work)); } /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); /* * Read the current timestamp first. It might become invalid anytime * when a virtual machine is stopped by the host or when the watchog * is touched from NMI. */ now = get_timestamp(); /* * If a virtual machine is stopped by the host it can look to * the watchdog like a soft lockup. This function touches the watchdog. */ kvm_check_and_clear_guest_paused(); /* * The stored timestamp is comparable with @now only when not touched. * It might get touched anytime from NMI. Make sure that is_softlockup() * uses the same (valid) value. */ period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts)); update_cpustat(); /* Reset the interval when touched by known problematic code. */ if (period_ts == SOFTLOCKUP_DELAY_REPORT) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically * make sure the scheduler tick is up to date. */ __this_cpu_write(softlockup_touch_sync, false); sched_clock_tick(); } update_report_ts(); return HRTIMER_RESTART; } /* Check for a softlockup. */ touch_ts = __this_cpu_read(watchdog_touch_ts); duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { /* * Prevent multiple soft-lockup reports if one cpu is already * engaged in dumping all cpu back traces. */ if (softlockup_all_cpu_backtrace) { if (test_and_set_bit_lock(0, &soft_lockup_nmi_warn)) return HRTIMER_RESTART; } /* Start period for the next softlockup warning. */ update_report_ts(); printk_cpu_sync_get_irqsave(flags); pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); report_cpu_status(); print_modules(); print_irqtrace_events(current); if (regs) show_regs(regs); else dump_stack(); printk_cpu_sync_put_irqrestore(flags); if (softlockup_all_cpu_backtrace) { trigger_allbutcpu_cpu_backtrace(smp_processor_id()); if (!softlockup_panic) clear_bit_unlock(0, &soft_lockup_nmi_warn); } add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); } return HRTIMER_RESTART; } static void watchdog_enable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); struct completion *done = this_cpu_ptr(&softlockup_completion); WARN_ON_ONCE(cpu != smp_processor_id()); init_completion(done); complete(done); /* * Start the timer first to prevent the hardlockup watchdog triggering * before the timer has a chance to fire. */ hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED_HARD); /* Initialize timestamp */ update_touch_ts(); /* Enable the hardlockup detector */ if (watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED) watchdog_hardlockup_enable(cpu); } static void watchdog_disable(unsigned int cpu) { struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); WARN_ON_ONCE(cpu != smp_processor_id()); /* * Disable the hardlockup detector first. That prevents that a large * delay between disabling the timer and disabling the hardlockup * detector causes a false positive. */ watchdog_hardlockup_disable(cpu); hrtimer_cancel(hrtimer); wait_for_completion(this_cpu_ptr(&softlockup_completion)); } static int softlockup_stop_fn(void *data) { watchdog_disable(smp_processor_id()); return 0; } static void softlockup_stop_all(void) { int cpu; if (!softlockup_initialized) return; for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false); cpumask_clear(&watchdog_allowed_mask); } static int softlockup_start_fn(void *data) { watchdog_enable(smp_processor_id()); return 0; } static void softlockup_start_all(void) { int cpu; cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); for_each_cpu(cpu, &watchdog_allowed_mask) smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false); } int lockup_detector_online_cpu(unsigned int cpu) { if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) watchdog_enable(cpu); return 0; } int lockup_detector_offline_cpu(unsigned int cpu) { if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) watchdog_disable(cpu); return 0; } static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_hardlockup_stop(); softlockup_stop_all(); set_sample_period(); lockup_detector_update_enable(); if (watchdog_enabled && watchdog_thresh) softlockup_start_all(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { mutex_lock(&watchdog_mutex); __lockup_detector_reconfigure(); mutex_unlock(&watchdog_mutex); } /* * Create the watchdog infrastructure and configure the detector(s). */ static __init void lockup_detector_setup(void) { /* * If sysctl is off and watchdog got disabled on the command line, * nothing to do here. */ lockup_detector_update_enable(); if (!IS_ENABLED(CONFIG_SYSCTL) && !(watchdog_enabled && watchdog_thresh)) return; mutex_lock(&watchdog_mutex); __lockup_detector_reconfigure(); softlockup_initialized = true; mutex_unlock(&watchdog_mutex); } #else /* CONFIG_SOFTLOCKUP_DETECTOR */ static void __lockup_detector_reconfigure(void) { cpus_read_lock(); watchdog_hardlockup_stop(); lockup_detector_update_enable(); watchdog_hardlockup_start(); cpus_read_unlock(); } void lockup_detector_reconfigure(void) { __lockup_detector_reconfigure(); } static inline void lockup_detector_setup(void) { __lockup_detector_reconfigure(); } #endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ /** * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) * * Special interface for parisc. It prevents lockup detector warnings from * the default pm_poweroff() function which busy loops forever. */ void lockup_detector_soft_poweroff(void) { watchdog_enabled = 0; } #ifdef CONFIG_SYSCTL /* Propagate any changes to the watchdog infrastructure */ static void proc_watchdog_update(void) { /* Remove impossible cpus to keep sysctl output clean. */ cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); __lockup_detector_reconfigure(); } /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * * caller | table->data points to | 'which' * -------------------|----------------------------------|------------------------------- * proc_watchdog | watchdog_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED | * | | WATCHDOG_SOFTOCKUP_ENABLED * -------------------|----------------------------------|------------------------------- * proc_nmi_watchdog | watchdog_hardlockup_user_enabled | WATCHDOG_HARDLOCKUP_ENABLED * -------------------|----------------------------------|------------------------------- * proc_soft_watchdog | watchdog_softlockup_user_enabled | WATCHDOG_SOFTOCKUP_ENABLED */ static int proc_watchdog_common(int which, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; mutex_lock(&watchdog_mutex); old = *param; if (!write) { /* * On read synchronize the userspace interface. This is a * racy snapshot. */ *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); *param = old; } else { err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && old != READ_ONCE(*param)) proc_watchdog_update(); } mutex_unlock(&watchdog_mutex); return err; } /* * /proc/sys/kernel/watchdog */ static int proc_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED | WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } /* * /proc/sys/kernel/nmi_watchdog */ static int proc_nmi_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!watchdog_hardlockup_available && write) return -ENOTSUPP; return proc_watchdog_common(WATCHDOG_HARDLOCKUP_ENABLED, table, write, buffer, lenp, ppos); } #ifdef CONFIG_SOFTLOCKUP_DETECTOR /* * /proc/sys/kernel/soft_watchdog */ static int proc_soft_watchdog(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(WATCHDOG_SOFTOCKUP_ENABLED, table, write, buffer, lenp, ppos); } #endif /* * /proc/sys/kernel/watchdog_thresh */ static int proc_watchdog_thresh(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err, old; mutex_lock(&watchdog_mutex); old = READ_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!err && write && old != READ_ONCE(watchdog_thresh)) proc_watchdog_update(); mutex_unlock(&watchdog_mutex); return err; } /* * The cpumask is the mask of possible cpus that the watchdog can run * on, not the mask of cpus it is actually running on. This allows the * user to specify a mask that will include cpus that have not yet * been brought online, if desired. */ static int proc_watchdog_cpumask(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err; mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) proc_watchdog_update(); mutex_unlock(&watchdog_mutex); return err; } static const int sixty = 60; static const struct ctl_table watchdog_sysctls[] = { { .procname = "watchdog", .data = &watchdog_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "watchdog_thresh", .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_watchdog_thresh, .extra1 = SYSCTL_ZERO, .extra2 = (void *)&sixty, }, { .procname = "watchdog_cpumask", .data = &watchdog_cpumask_bits, .maxlen = NR_CPUS, .mode = 0644, .proc_handler = proc_watchdog_cpumask, }, #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", .data = &watchdog_softlockup_user_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "softlockup_panic", .data = &softlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { .procname = "softlockup_all_cpu_backtrace", .data = &sysctl_softlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif #ifdef CONFIG_HARDLOCKUP_DETECTOR { .procname = "hardlockup_panic", .data = &hardlockup_panic, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #ifdef CONFIG_SMP { .procname = "hardlockup_all_cpu_backtrace", .data = &sysctl_hardlockup_all_cpu_backtrace, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif /* CONFIG_SMP */ #endif }; static struct ctl_table watchdog_hardlockup_sysctl[] = { { .procname = "nmi_watchdog", .data = &watchdog_hardlockup_user_enabled, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_nmi_watchdog, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; static void __init watchdog_sysctl_init(void) { register_sysctl_init("kernel", watchdog_sysctls); if (watchdog_hardlockup_available) watchdog_hardlockup_sysctl[0].mode = 0644; register_sysctl_init("kernel", watchdog_hardlockup_sysctl); } #else #define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static void __init lockup_detector_delay_init(struct work_struct *work); static bool allow_lockup_detector_init_retry __initdata; static struct work_struct detector_work __initdata = __WORK_INITIALIZER(detector_work, lockup_detector_delay_init); static void __init lockup_detector_delay_init(struct work_struct *work) { int ret; ret = watchdog_hardlockup_probe(); if (ret) { if (ret == -ENODEV) pr_info("NMI not fully supported\n"); else pr_info("Delayed init of the lockup detector failed: %d\n", ret); pr_info("Hard watchdog permanently disabled\n"); return; } allow_lockup_detector_init_retry = false; watchdog_hardlockup_available = true; lockup_detector_setup(); } /* * lockup_detector_retry_init - retry init lockup detector if possible. * * Retry hardlockup detector init. It is useful when it requires some * functionality that has to be initialized later on a particular * platform. */ void __init lockup_detector_retry_init(void) { /* Must be called before late init calls */ if (!allow_lockup_detector_init_retry) return; schedule_work(&detector_work); } /* * Ensure that optional delayed hardlockup init is proceed before * the init code and memory is freed. */ static int __init lockup_detector_check(void) { /* Prevent any later retry. */ allow_lockup_detector_init_retry = false; /* Make sure no work is pending. */ flush_work(&detector_work); watchdog_sysctl_init(); return 0; } late_initcall_sync(lockup_detector_check); void __init lockup_detector_init(void) { if (tick_nohz_full_enabled()) pr_info("Disabling watchdog on nohz_full cores by default\n"); cpumask_copy(&watchdog_cpumask, housekeeping_cpumask(HK_TYPE_TIMER)); if (!watchdog_hardlockup_probe()) watchdog_hardlockup_available = true; else allow_lockup_detector_init_retry = true; lockup_detector_setup(); }
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor Centralised disconnect handling. * New timer architecture. * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of * facilities negotiation and increased * the throughput upper limit. * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). * Fixed x25_output() related skb leakage. * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data * 2005-04-15 Shaun Pereira Fast select with no restriction on * response */ #define pr_fmt(fmt) "X25: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/notifier.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ctype.h> #include <net/x25.h> #include <net/compat.h> int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; int sysctl_x25_forward = 0; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); static const struct proto_ops x25_proto_ops; static const struct x25_address null_x25_address = {" "}; #ifdef CONFIG_COMPAT struct compat_x25_subscrip_struct { char device[200-sizeof(compat_ulong_t)]; compat_ulong_t global_facil_mask; compat_uint_t extended; }; #endif int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned char len; int needed; int rc; if (!pskb_may_pull(skb, 1)) { /* packet has no address block */ rc = 0; goto empty; } len = *skb->data; needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims to hold */ rc = -1; goto empty; } return x25_addr_ntoa(skb->data, called_addr, calling_addr); empty: *called_addr->x25_addr = 0; *calling_addr->x25_addr = 0; return rc; } int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; called = called_addr->x25_addr; calling = calling_addr->x25_addr; p++; for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *called++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *called++ = ((*p >> 4) & 0x0F) + '0'; } } else { if (i % 2 != 0) { *calling++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *calling++ = ((*p >> 4) & 0x0F) + '0'; } } } *called = *calling = '\0'; return 1 + (called_len + calling_len + 1) / 2; } int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; int i; called = called_addr->x25_addr; calling = calling_addr->x25_addr; called_len = strlen(called); calling_len = strlen(calling); *p++ = (calling_len << 4) | (called_len << 0); for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *p |= (*called++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*called++ - '0') << 4; } } else { if (i % 2 != 0) { *p |= (*calling++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*calling++ - '0') << 4; } } } return 1 + (called_len + calling_len + 1) / 2; } /* * Socket removal during an interrupt is now safe. */ static void x25_remove_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_del_node_init(sk); write_unlock_bh(&x25_list_lock); } /* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type == ARPHRD_X25) { switch (event) { case NETDEV_REGISTER: case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } x25_route_device_down(dev); break; case NETDEV_PRE_TYPE_CHANGE: case NETDEV_UNREGISTER: x25_link_device_down(dev); break; case NETDEV_CHANGE: if (!netif_carrier_ok(dev)) { nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } } break; } } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void x25_insert_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_add_node(sk, &x25_list); write_unlock_bh(&x25_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. Check the full list for an address/cud match. * If no cuds match return the next_best thing, an address match. * Note: if a listening socket has cud set it must only get calls * with matching cud. */ static struct sock *x25_find_listener(struct x25_address *addr, struct sk_buff *skb) { struct sock *s; struct sock *next_best; read_lock_bh(&x25_list_lock); next_best = NULL; sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ if (x25_sk(s)->cudmatchlength > 0 && skb->len >= x25_sk(s)->cudmatchlength) { if((memcmp(x25_sk(s)->calluserdata.cuddata, skb->data, x25_sk(s)->cudmatchlength)) == 0) { sock_hold(s); goto found; } } else next_best = s; } if (next_best) { s = next_best; sock_hold(s); goto found; } s = NULL; found: read_unlock_bh(&x25_list_lock); return s; } /* * Find a connected X.25 socket given my LCI and neighbour. */ static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; } s = NULL; found: return s; } struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; read_lock_bh(&x25_list_lock); s = __x25_find_socket(lci, nb); read_unlock_bh(&x25_list_lock); return s; } /* * Find a unique LCI for a given device. */ static unsigned int x25_new_lci(struct x25_neigh *nb) { unsigned int lci = 1; struct sock *sk; while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } cond_resched(); } return lci; } /* * Deferred destroy. */ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ static void x25_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer */ static void __x25_destroy_socket(struct sock *sk) { struct sk_buff *skb; x25_stop_heartbeat(sk); x25_stop_timer(sk); x25_remove_socket(sk); x25_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* * Queue the unaccepted socket for death */ skb->sk->sk_state = TCP_LISTEN; sock_set_flag(skb->sk, SOCK_DEAD); x25_start_heartbeat(skb->sk); x25_sk(skb->sk)->state = X25_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ __sock_put(sk); } } void x25_destroy_socket_from_timer(struct sock *sk) { sock_hold(sk); bh_lock_sock(sk); __x25_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * X.25 socket object. */ static int x25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; int rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EINVAL; if (optlen < sizeof(int)) goto out; rc = -EFAULT; if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); else clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = 0; out: return rc; } static int x25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int val, len, rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EFAULT; if (get_user(len, optlen)) goto out; rc = -EINVAL; if (len < 0) goto out; len = min_t(unsigned int, len, sizeof(int)); rc = -EFAULT; if (put_user(len, optlen)) goto out; val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = copy_to_user(optval, &val, len) ? -EFAULT : 0; out: return rc; } static int x25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { rc = -EINVAL; release_sock(sk); return rc; } if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } release_sock(sk); return rc; } static struct proto x25_proto = { .name = "X25", .owner = THIS_MODULE, .obj_size = sizeof(struct x25_sock), }; static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; sock_init_data(NULL, sk); x25 = x25_sk(sk); skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); out: return sk; } static int x25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct x25_sock *x25; int rc = -EAFNOSUPPORT; if (!net_eq(net, &init_net)) goto out; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_SEQPACKET) goto out; rc = -EINVAL; if (protocol) goto out; rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); sock_init_data(sock, sk); x25_init_timers(sk); sock->ops = &x25_proto_ops; sk->sk_protocol = protocol; sk->sk_backlog_rcv = x25_backlog_rcv; x25->t21 = sysctl_x25_call_request_timeout; x25->t22 = sysctl_x25_reset_request_timeout; x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = 0; /* by default don't negotiate throughput */ x25->facilities.reverse = X25_DEFAULT_REVERSE; x25->dte_facilities.calling_len = 0; x25->dte_facilities.called_len = 0; memset(x25->dte_facilities.called_ae, '\0', sizeof(x25->dte_facilities.called_ae)); memset(x25->dte_facilities.calling_ae, '\0', sizeof(x25->dte_facilities.calling_ae)); rc = 0; out: return rc; } static struct sock *x25_make_new(struct sock *osk) { struct sock *sk = NULL; struct x25_sock *x25, *ox25; if (osk->sk_type != SOCK_SEQPACKET) goto out; if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_backlog_rcv = osk->sk_backlog_rcv; sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; x25->t22 = ox25->t22; x25->t23 = ox25->t23; x25->t2 = ox25->t2; x25->flags = ox25->flags; x25->facilities = ox25->facilities; x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; clear_bit(X25_INTERRUPT_FLAG, &x25->flags); x25_init_timers(sk); out: return sk; } static int x25_release(struct socket *sock) { struct sock *sk = sock->sk; struct x25_sock *x25; if (!sk) return 0; x25 = x25_sk(sk); sock_hold(sk); lock_sock(sk); switch (x25->state) { case X25_STATE_0: case X25_STATE_2: x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; case X25_STATE_1: case X25_STATE_3: case X25_STATE_4: x25_clear_queues(sk); x25_write_internal(sk, X25_CLEAR_REQUEST); x25_start_t23timer(sk); x25->state = X25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; case X25_STATE_5: x25_write_internal(sk, X25_CLEAR_REQUEST); x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; } sock_orphan(sk); out: release_sock(sk); sock_put(sk); return 0; } static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } /* check for the null_x25_address */ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { len = strlen(addr->sx25_addr.x25_addr); for (i = 0; i < len; i++) { if (!isdigit(addr->sx25_addr.x25_addr[i])) { rc = -EINVAL; goto out; } } } lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); } else { rc = -EINVAL; } release_sock(sk); net_dbg_ratelimited("x25_bind: socket is bound\n"); out: return rc; } static int x25_wait_for_connection_establishment(struct sock *sk) { DECLARE_WAITQUEUE(wait, current); int rc; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = sock_error(sk); if (rc) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = -ENOTCONN; if (sk->sk_state == TCP_CLOSE) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); schedule(); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; struct x25_route *rt; int rc = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out; /* Connect completed during a ERESTARTSYS event */ } rc = -ECONNREFUSED; if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; goto out; } rc = -EISCONN; /* No reconnect on a seqpacket socket */ if (sk->sk_state == TCP_ESTABLISHED) goto out; rc = -EALREADY; /* Do nothing if call is already in progress */ if (sk->sk_state == TCP_SYN_SENT) goto out; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; rt = x25_get_route(&addr->sx25_addr); if (!rt) goto out; x25->neighbour = x25_get_neigh(rt->dev); if (!x25->neighbour) goto out_put_route; x25_limit_facilities(&x25->facilities, x25->neighbour); x25->lci = x25_new_lci(x25->neighbour); if (!x25->lci) goto out_put_neigh; rc = -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ goto out_put_neigh; if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) memset(&x25->source_addr, '\0', X25_ADDR_LEN); x25->dest_addr = addr->sx25_addr; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; x25->state = X25_STATE_1; x25_write_internal(sk, X25_CALL_REQUEST); x25_start_heartbeat(sk); x25_start_t21timer(sk); /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) goto out_put_neigh; sock->state = SS_CONNECTED; rc = 0; out_put_neigh: if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; read_unlock_bh(&x25_list_lock); x25->state = X25_STATE_0; } out_put_route: x25_route_put(rt); out: release_sock(sk); return rc; } static int x25_wait_for_data(struct sock *sk, long timeout) { DECLARE_WAITQUEUE(wait, current); int rc = 0; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; if (skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; struct sk_buff *skb; int rc = -EINVAL; if (!sk) goto out; rc = -EOPNOTSUPP; if (sk->sk_type != SOCK_SEQPACKET) goto out; lock_sock(sk); rc = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out2; rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto out2; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: release_sock(sk); out: return rc; } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); int rc = 0; if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; goto out; } sx25->sx25_addr = x25->dest_addr; } else sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; rc = sizeof(*sx25); out: return rc; } int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { struct sock *sk; struct sock *make; struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; int len, addr_len, rc; /* * Remove the LCI and frame type. */ skb_pull(skb, X25_STD_MIN_LEN); /* * Extract the X.25 addresses and convert them to ASCII strings, * and remove them. * * Address block is mandatory in call request packets */ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); if (addr_len <= 0) goto out_clear_request; skb_pull(skb, addr_len); /* * Get the length of the facilities, skip past them for the moment * get the call user data because this is needed to determine * the correct listener * * Facilities length is mandatory in call request packets */ if (!pskb_may_pull(skb, 1)) goto out_clear_request; len = skb->data[0] + 1; if (!pskb_may_pull(skb, len)) goto out_clear_request; skb_pull(skb,len); /* * Ensure that the amount of call user data is valid. */ if (skb->len > X25_MAX_CUD_LEN) goto out_clear_request; /* * Get all the call user data so it can be used in * x25_find_listener and skb_copy_from_linear_data up ahead. */ if (!pskb_may_pull(skb, skb->len)) goto out_clear_request; /* * Find a listener for the particular address/cud pair. */ sk = x25_find_listener(&source_addr,skb); skb_push(skb,len); if (sk != NULL && sk_acceptq_is_full(sk)) { goto out_sock_put; } /* * We dont have any listeners for this incoming call. * Try forwarding it. */ if (sk == NULL) { skb_push(skb, addr_len + X25_STD_MIN_LEN); if (sysctl_x25_forward && x25_forward_call(&dest_addr, nb, skb, lci) > 0) { /* Call was forwarded, dont process it any more */ kfree_skb(skb); rc = 1; goto out; } else { /* No listeners, can't forward, clear the call */ goto out_clear_request; } } /* * Try to reach a compromise on the requested facilities. */ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); if (len == -1) goto out_sock_put; /* * current neighbour/link might impose additional limits * on certain facilities */ x25_limit_facilities(&facilities, nb); /* * Try to create a new socket. */ make = x25_make_new(sk); if (!make) goto out_sock_put; /* * Remove the facilities */ skb_pull(skb, len); skb->sk = make; make->sk_state = TCP_ESTABLISHED; makex25 = x25_sk(make); makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; /* ensure no calling address extension on accept */ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediately */ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; } else { makex25->state = X25_STATE_5; } /* * Incoming Call User Data. */ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; sk_acceptq_added(sk); x25_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: return rc; out_sock_put: sock_put(sk); out_clear_request: rc = 0; x25_transmit_clear_request(nb, lci, 0x01); goto out; } static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char *asmptr; int noblock = msg->msg_flags & MSG_DONTWAIT; size_t size; int qbit = 0, rc = -EINVAL; lock_sock(sk); if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) goto out; /* we currently don't support segmented records at the user interface */ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) goto out; rc = -EADDRNOTAVAIL; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); goto out; } rc = -ENETUNREACH; if (!x25->neighbour) goto out; if (usx25) { rc = -EINVAL; if (msg->msg_namelen < sizeof(sx25)) goto out; memcpy(&sx25, usx25, sizeof(sx25)); rc = -EISCONN; if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) goto out; rc = -EINVAL; if (sx25.sx25_family != AF_X25) goto out; } else { /* * FIXME 1003.1g - if the socket is like this because * it has become closed (not started closed) we ought * to SIGPIPE, EPIPE; */ rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; sx25.sx25_family = AF_X25; sx25.sx25_addr = x25->dest_addr; } /* Sanity check the packet size */ if (len > 65535) { rc = -EMSGSIZE; goto out; } net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; X25_SKB_CB(skb)->flags = msg->msg_flags; skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); /* * Put the data on the end */ net_dbg_ratelimited("x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { if (!pskb_may_pull(skb, 1)) goto out_kfree_skb; qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the X.25 header */ net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } else { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } } else { if (x25->neighbour->extended) { /* Build an Extended X.25 header */ asmptr = skb_push(skb, X25_EXT_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; *asmptr++ = X25_DATA; } else { /* Build an Standard X.25 header */ asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; } if (qbit) skb->data[0] |= X25_Q_BIT; } net_dbg_ratelimited("x25_sendmsg: Built header.\n"); net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out_kfree_skb; if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { rc = x25_output(sk, skb); len = rc; if (rc < 0) kfree_skb(skb); else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) len++; } x25_kick(sk); rc = len; out: release_sock(sk); return rc; out_kfree_skb: kfree_skb(skb); goto out; } static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; unsigned char *asmptr; int rc = -ENOTCONN; lock_sock(sk); if (x25->neighbour == NULL) goto out; header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) goto out; if (flags & MSG_OOB) { rc = -EINVAL; if (sock_flag(sk, SOCK_URGINLINE) || !skb_peek(&x25->interrupt_in_queue)) goto out; skb = skb_dequeue(&x25->interrupt_in_queue); if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) goto out_free_dgram; skb_pull(skb, X25_STD_MIN_LEN); /* * No Q bit information on Interrupt data. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = 0x00; } msg->msg_flags |= MSG_OOB; } else { /* Now we can treat all alike */ release_sock(sk); skb = skb_recv_datagram(sk, flags, &rc); lock_sock(sk); if (!skb) goto out; if (!pskb_may_pull(skb, header_len)) goto out_free_dgram; qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; skb_pull(skb, header_len); if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = qbit; } } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; if (sx25) { sx25->sx25_family = AF_X25; sx25->sx25_addr = x25->dest_addr; msg->msg_namelen = sizeof(*sx25); } x25_check_rbuf(sk); rc = copied; out_free_dgram: skb_free_datagram(sk, skb); out: release_sock(sk); return rc; } static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); void __user *argp = (void __user *)arg; int rc; switch (cmd) { case TIOCOUTQ: { int amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; rc = put_user(amount, (unsigned int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; int amount = 0; /* * These two are safe on a single CPU system as * only user tasks fiddle here */ lock_sock(sk); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); rc = put_user(amount, (unsigned int __user *)argp); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->facilities, sizeof(x25->facilities)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SFACILITIES: { struct x25_facilities facilities; rc = -EFAULT; if (copy_from_user(&facilities, argp, sizeof(facilities))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_fac_release; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) goto out_fac_release; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) goto out_fac_release; if (facilities.winsize_in < 1 || facilities.winsize_in > 127) goto out_fac_release; if (facilities.throughput) { int out = facilities.throughput & 0xf0; int in = facilities.throughput & 0x0f; if (!out) facilities.throughput |= X25_DEFAULT_THROUGHPUT << 4; else if (out < 0x30 || out > 0xD0) goto out_fac_release; if (!in) facilities.throughput |= X25_DEFAULT_THROUGHPUT; else if (in < 0x03 || in > 0x0D) goto out_fac_release; } if (facilities.reverse && (facilities.reverse & 0x81) != 0x81) goto out_fac_release; x25->facilities = facilities; rc = 0; out_fac_release: release_sock(sk); break; } case SIOCX25GDTEFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->dte_facilities, sizeof(x25->dte_facilities)); release_sock(sk); if (rc) rc = -EFAULT; break; } case SIOCX25SDTEFACILITIES: { struct x25_dte_facilities dtefacs; rc = -EFAULT; if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: release_sock(sk); break; } case SIOCX25GCALLUSERDATA: { lock_sock(sk); rc = copy_to_user(argp, &x25->calluserdata, sizeof(x25->calluserdata)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCALLUSERDATA: { struct x25_calluserdata calluserdata; rc = -EFAULT; if (copy_from_user(&calluserdata, argp, sizeof(calluserdata))) break; rc = -EINVAL; if (calluserdata.cudlength > X25_MAX_CUD_LEN) break; lock_sock(sk); x25->calluserdata = calluserdata; release_sock(sk); rc = 0; break; } case SIOCX25GCAUSEDIAG: { lock_sock(sk); rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCAUSEDIAG: { struct x25_causediag causediag; rc = -EFAULT; if (copy_from_user(&causediag, argp, sizeof(causediag))) break; lock_sock(sk); x25->causediag = causediag; release_sock(sk); rc = 0; break; } case SIOCX25SCUDMATCHLEN: { struct x25_subaddr sub_addr; rc = -EINVAL; lock_sock(sk); if(sk->sk_state != TCP_CLOSE) goto out_cud_release; rc = -EFAULT; if (copy_from_user(&sub_addr, argp, sizeof(sub_addr))) goto out_cud_release; rc = -EINVAL; if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN) goto out_cud_release; x25->cudmatchlength = sub_addr.cudmatchlength; rc = 0; out_cud_release: release_sock(sk); break; } case SIOCX25CALLACCPTAPPRV: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_CLOSE) { clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); rc = 0; } release_sock(sk); break; } case SIOCX25SENDCALLACCPT: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) goto out_sendcallaccpt_release; /* must call accptapprv above */ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) goto out_sendcallaccpt_release; x25_write_internal(sk, X25_CALL_ACCEPTED); x25->state = X25_STATE_3; rc = 0; out_sendcallaccpt_release: release_sock(sk); break; } default: rc = -ENOIOCTLCMD; break; } return rc; } static const struct net_proto_family x25_family_ops = { .family = AF_X25, .create = x25_create, .owner = THIS_MODULE, }; #ifdef CONFIG_COMPAT static int compat_x25_subscr_ioctl(unsigned int cmd, struct compat_x25_subscrip_struct __user *x25_subscr32) { struct compat_x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; rc = -EFAULT; if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) goto out; rc = -EINVAL; dev = x25_dev_get(x25_subscr.device); if (dev == NULL) goto out; nb = x25_get_neigh(dev); if (nb == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(x25_subscr32, &x25_subscr, sizeof(*x25_subscr32)) ? -EFAULT : 0; } else { rc = -EINVAL; if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int rc = -ENOIOCTLCMD; switch(cmd) { case TIOCOUTQ: case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: case SIOCX25SFACILITIES: case SIOCX25GDTEFACILITIES: case SIOCX25SDTEFACILITIES: case SIOCX25GCALLUSERDATA: case SIOCX25SCALLUSERDATA: case SIOCX25GCAUSEDIAG: case SIOCX25SCAUSEDIAG: case SIOCX25SCUDMATCHLEN: case SIOCX25CALLACCPTAPPRV: case SIOCX25SENDCALLACCPT: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #endif static const struct proto_ops x25_proto_ops = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, .bind = x25_bind, .connect = x25_connect, .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, .getsockopt = x25_getsockopt, .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, }; static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; static struct notifier_block x25_dev_notifier = { .notifier_call = x25_device_event, }; void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; write_lock_bh(&x25_list_lock); sk_for_each(s, &x25_list) { if (x25_sk(s)->neighbour == nb) { write_unlock_bh(&x25_list_lock); lock_sock(s); x25_disconnect(s, ENETUNREACH, 0, 0); release_sock(s); write_lock_bh(&x25_list_lock); } } write_unlock_bh(&x25_list_lock); /* Remove any related forwards */ x25_clear_forward_by_dev(nb->dev); } static int __init x25_init(void) { int rc; rc = proto_register(&x25_proto, 0); if (rc) goto out; rc = sock_register(&x25_family_ops); if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); if (rc) goto out_sock; rc = x25_register_sysctl(); if (rc) goto out_dev; rc = x25_proc_init(); if (rc) goto out_sysctl; pr_info("Linux Version 0.2\n"); out: return rc; out_sysctl: x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); goto out; } module_init(x25_init); static void __exit x25_exit(void) { x25_proc_exit(); x25_link_free(); x25_route_free(); x25_unregister_sysctl(); unregister_netdevice_notifier(&x25_dev_notifier); dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); proto_unregister(&x25_proto); } module_exit(x25_exit); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_X25);
27 7 75 10 81 37 37 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 // SPDX-License-Identifier: GPL-2.0 #include <linux/buffer_head.h> #include <linux/slab.h> #include "minix.h" enum {DEPTH = 3, DIRECT = 7}; /* Only double indirect */ typedef u16 block_t; /* 16 bit, host order */ static inline unsigned long block_to_cpu(block_t n) { return n; } static inline block_t cpu_to_block(unsigned long n) { return n; } static inline block_t *i_data(struct inode *inode) { return (block_t *)minix_i(inode)->u.i1_data; } static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; if (block < 0) { printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", block, inode->i_sb->s_bdev); return 0; } if ((u64)block * BLOCK_SIZE >= inode->i_sb->s_maxbytes) return 0; if (block < 7) { offsets[n++] = block; } else if ((block -= 7) < 512) { offsets[n++] = 7; offsets[n++] = block; } else { block -= 512; offsets[n++] = 8; offsets[n++] = block>>9; offsets[n++] = block & 511; } return n; } #include "itree_common.c" int V1_minix_get_block(struct inode * inode, long block, struct buffer_head *bh_result, int create) { return get_block(inode, block, bh_result, create); } void V1_minix_truncate(struct inode * inode) { truncate(inode); } unsigned V1_minix_blocks(loff_t size, struct super_block *sb) { return nblocks(size, sb); }
21 13 2 13 13 16 1 5 15 15 16 15 5 15 4 2 4 2 2 2 12 12 6 6 1 6 6 6 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 // SPDX-License-Identifier: GPL-2.0-or-later /* AFS dynamic root handling * * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/fs.h> #include <linux/namei.h> #include <linux/dns_resolver.h> #include "internal.h" #define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */ #define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX) static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino); /* * iget5() comparator for inode created by autocell operations */ static int afs_iget5_pseudo_test(struct inode *inode, void *opaque) { struct afs_fid *fid = opaque; return inode->i_ino == fid->vnode; } /* * iget5() inode initialiser */ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque) { struct afs_super_info *as = AFS_FS_S(inode->i_sb); struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_fid *fid = opaque; vnode->volume = as->volume; vnode->fid = *fid; inode->i_ino = fid->vnode; inode->i_generation = fid->unique; return 0; } /* * Create an inode for an autocell dynamic automount dir. */ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) { struct afs_vnode *vnode; struct inode *inode; struct afs_fid fid = { .vnode = ino, .unique = 1, }; _enter(""); inode = iget5_locked(sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); if (!inode) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", inode, inode->i_ino, fid.vid, fid.vnode, fid.unique); vnode = AFS_FS_I(inode); if (inode->i_state & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); inode->i_size = 0; inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_autocell_inode_operations; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_blocks = 0; inode->i_generation = 0; inode->i_flags |= S_AUTOMOUNT | S_NOATIME; set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); unlock_new_inode(inode); } _leave(" = %p", inode); return inode; } /* * Try to automount the mountpoint with pseudo directory, if the autocell * option is set. */ static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct afs_cell *cell = NULL; struct afs_net *net = afs_d2net(dentry); struct inode *inode = NULL; const char *name = dentry->d_name.name; size_t len = dentry->d_name.len; bool dotted = false; int ret = -ENOENT; /* Names prefixed with a dot are R/W mounts. */ if (name[0] == '.') { name++; len--; dotted = true; } cell = afs_lookup_cell(net, name, len, NULL, false, afs_cell_trace_use_lookup_dynroot); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto out_no_cell; } inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out; } dentry->d_fsdata = cell; return d_splice_alias(inode, dentry); out: afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot); out_no_cell: if (!inode) return d_splice_alias(inode, dentry); return ret == -ENOENT ? NULL : ERR_PTR(ret); } /* * Look up an entry in a dynroot directory. */ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { _enter("%pd", dentry); if (flags & LOOKUP_CREATE) return ERR_PTR(-EOPNOTSUPP); if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } if (dentry->d_name.len == 5 && memcmp(dentry->d_name.name, "@cell", 5) == 0) return afs_lookup_atcell(dir, dentry, 2); if (dentry->d_name.len == 6 && memcmp(dentry->d_name.name, ".@cell", 6) == 0) return afs_lookup_atcell(dir, dentry, 3); return afs_dynroot_lookup_cell(dir, dentry, flags); } const struct inode_operations afs_dynroot_inode_operations = { .lookup = afs_dynroot_lookup, }; static void afs_dynroot_d_release(struct dentry *dentry) { struct afs_cell *cell = dentry->d_fsdata; afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt); } /* * Keep @cell symlink dentries around, but only keep cell autodirs when they're * being used. */ static int afs_dynroot_delete_dentry(const struct dentry *dentry) { const struct qstr *name = &dentry->d_name; if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0) return 0; if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0) return 0; return 1; } const struct dentry_operations afs_dynroot_dentry_operations = { .d_delete = afs_dynroot_delete_dentry, .d_release = afs_dynroot_d_release, .d_automount = afs_d_automount, }; static void afs_atcell_delayed_put_cell(void *arg) { struct afs_cell *cell = arg; afs_put_cell(cell, afs_cell_trace_put_atcell); } /* * Read @cell or .@cell symlinks. */ static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_cell *cell; struct afs_net *net = afs_i2net(inode); const char *name; bool dotted = vnode->fid.vnode == 3; if (!rcu_access_pointer(net->ws_cell)) return ERR_PTR(-ENOENT); if (!dentry) { /* We're in RCU-pathwalk. */ cell = rcu_dereference(net->ws_cell); if (dotted) name = cell->name - 1; else name = cell->name; /* Shouldn't need to set a delayed call. */ return name; } down_read(&net->cells_lock); cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); if (dotted) name = cell->name - 1; else name = cell->name; afs_get_cell(cell, afs_cell_trace_get_atcell); set_delayed_call(done, afs_atcell_delayed_put_cell, cell); up_read(&net->cells_lock); return name; } static const struct inode_operations afs_atcell_inode_operations = { .get_link = afs_atcell_get_link, }; /* * Create an inode for the @cell or .@cell symlinks. */ static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino) { struct afs_vnode *vnode; struct inode *inode; struct afs_fid fid = { .vnode = ino, .unique = 1, }; inode = iget5_locked(dir->i_sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); if (!inode) return ERR_PTR(-ENOMEM); vnode = AFS_FS_I(inode); if (inode->i_state & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 1); inode->i_size = 0; inode->i_mode = S_IFLNK | 0555; inode->i_op = &afs_atcell_inode_operations; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_blocks = 0; inode->i_generation = 0; inode->i_flags |= S_NOATIME; unlock_new_inode(inode); } return d_splice_alias(inode, dentry); } /* * Transcribe the cell database into readdir content under the RCU read lock. * Each cell produces two entries, one prefixed with a dot and one not. */ static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx) { const struct afs_cell *cell; loff_t newpos; _enter("%llu", ctx->pos); for (;;) { unsigned int ix = ctx->pos >> 1; cell = idr_get_next(&net->cells_dyn_ino, &ix); if (!cell) return 0; if (READ_ONCE(cell->state) == AFS_CELL_REMOVING || READ_ONCE(cell->state) == AFS_CELL_DEAD) { ctx->pos += 2; ctx->pos &= ~1; continue; } newpos = ix << 1; if (newpos > ctx->pos) ctx->pos = newpos; _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino); if ((ctx->pos & 1) == 0) { if (!dir_emit(ctx, cell->name, cell->name_len, cell->dynroot_ino, DT_DIR)) return 0; ctx->pos++; } if ((ctx->pos & 1) == 1) { if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1, cell->dynroot_ino + 1, DT_DIR)) return 0; ctx->pos++; } } return 0; } /* * Read the AFS dynamic root directory. This produces a list of cellnames, * dotted and undotted, along with @cell and .@cell links if configured. */ static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx) { struct afs_net *net = afs_d2net(file->f_path.dentry); int ret = 0; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) { if (rcu_access_pointer(net->ws_cell) && !dir_emit(ctx, "@cell", 5, 2, DT_LNK)) return 0; ctx->pos = 3; } if (ctx->pos == 3) { if (rcu_access_pointer(net->ws_cell) && !dir_emit(ctx, ".@cell", 6, 3, DT_LNK)) return 0; ctx->pos = 4; } if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { rcu_read_lock(); ret = afs_dynroot_readdir_cells(net, ctx); rcu_read_unlock(); } return ret; } static const struct file_operations afs_dynroot_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = afs_dynroot_readdir, .fsync = noop_fsync, }; /* * Create an inode for a dynamic root directory. */ struct inode *afs_dynroot_iget_root(struct super_block *sb) { struct afs_super_info *as = AFS_FS_S(sb); struct afs_vnode *vnode; struct inode *inode; struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,}; if (as->volume) fid.vid = as->volume->vid; inode = iget5_locked(sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); if (!inode) return ERR_PTR(-ENOMEM); vnode = AFS_FS_I(inode); /* there shouldn't be an existing inode */ if (inode->i_state & I_NEW) { netfs_inode_init(&vnode->netfs, NULL, false); simple_inode_init_ts(inode); set_nlink(inode, 2); inode->i_size = 0; inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_dynroot_inode_operations; inode->i_fop = &afs_dynroot_file_operations; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_blocks = 0; inode->i_generation = 0; inode->i_flags |= S_NOATIME; set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); unlock_new_inode(inode); } _leave(" = %p", inode); return inode; }
16 16 16 15 15 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "rsrc.h" #include "nop.h" struct io_nop { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct file *file; int result; int fd; unsigned int flags; }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); nop->flags = READ_ONCE(sqe->nop_flags); if (nop->flags & ~NOP_FLAGS) return -EINVAL; if (nop->flags & IORING_NOP_INJECT_RESULT) nop->result = READ_ONCE(sqe->len); else nop->result = 0; if (nop->flags & IORING_NOP_FILE) nop->fd = READ_ONCE(sqe->fd); else nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) req->buf_index = READ_ONCE(sqe->buf_index); return 0; } int io_nop(struct io_kiocb *req, unsigned int issue_flags) { struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); int ret = nop->result; if (nop->flags & IORING_NOP_FILE) { if (nop->flags & IORING_NOP_FIXED_FILE) { req->file = io_file_get_fixed(req, nop->fd, issue_flags); req->flags |= REQ_F_FIXED_FILE; } else { req->file = io_file_get_normal(req, nop->fd); } if (!req->file) { ret = -EBADF; goto done; } } if (nop->flags & IORING_NOP_FIXED_BUFFER) { if (!io_find_buf_node(req, issue_flags)) ret = -EFAULT; } done: if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); return IOU_OK; }
1 1 2 1 1 1 297 1 295 3 2 4 4 5 5 307 6 301 5 295 5 291 17 15 2 7 8 8 17 13 8 10 3 3 1 1 10 3 11 17 15 17 8 2 16 10 3 3 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/pnode.c * * (C) Copyright IBM Corporation 2005. * Author : Ram Pai (linuxram@us.ibm.com) */ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/nsproxy.h> #include <uapi/linux/mount.h> #include "internal.h" #include "pnode.h" /* return the next shared peer mount of @p */ static inline struct mount *next_peer(struct mount *p) { return list_entry(p->mnt_share.next, struct mount, mnt_share); } static inline struct mount *first_slave(struct mount *p) { return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } static inline struct mount *last_slave(struct mount *p) { return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); } static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); } static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) { struct mount *m = mnt; do { /* Check the namespace first for optimization */ if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) return m; m = next_peer(m); } while (m != mnt); return NULL; } /* * Get ID of closest dominating peer group having a representative * under the given root. * * Caller must hold namespace_sem */ int get_dominating_id(struct mount *mnt, const struct path *root) { struct mount *m; for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); if (d) return d->mnt_group_id; } return 0; } static int do_make_slave(struct mount *mnt) { struct mount *master, *slave_mnt; if (list_empty(&mnt->mnt_share)) { if (IS_MNT_SHARED(mnt)) { mnt_release_group_id(mnt); CLEAR_MNT_SHARED(mnt); } master = mnt->mnt_master; if (!master) { struct list_head *p = &mnt->mnt_slave_list; while (!list_empty(p)) { slave_mnt = list_first_entry(p, struct mount, mnt_slave); list_del_init(&slave_mnt->mnt_slave); slave_mnt->mnt_master = NULL; } return 0; } } else { struct mount *m; /* * slave 'mnt' to a peer mount that has the * same root dentry. If none is available then * slave it to anything that is available. */ for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { if (m->mnt.mnt_root == mnt->mnt.mnt_root) { master = m; break; } } list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; CLEAR_MNT_SHARED(mnt); } list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); mnt->mnt_master = master; return 0; } /* * vfsmount lock must be held for write */ void change_mnt_propagation(struct mount *mnt, int type) { if (type == MS_SHARED) { set_mnt_shared(mnt); return; } do_make_slave(mnt); if (type != MS_SLAVE) { list_del_init(&mnt->mnt_slave); mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) mnt->mnt.mnt_flags |= MNT_UNBINDABLE; else mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; } } /* * get the next mount in the propagation tree. * @m: the mount seen last * @origin: the original mount from where the tree walk initiated * * Note that peer groups form contiguous segments of slave lists. * We rely on that in get_source() to be able to find out if * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { struct mount *master = m->mnt_master; if (master == origin->mnt_master) { struct mount *next = next_peer(m); return (next == origin) ? NULL : next; } else if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); /* back at master */ m = master; } } static struct mount *skip_propagation_subtree(struct mount *m, struct mount *origin) { /* * Advance m such that propagation_next will not return * the slaves of m. */ if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) m = last_slave(m); return m; } static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { while (1) { struct mount *next; if (!IS_MNT_PROPAGATED(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { if (next == origin) return NULL; } else if (m->mnt_slave.next != &next->mnt_slave) break; m = next; } /* m is the last peer */ while (1) { struct mount *master = m->mnt_master; if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); m = next_peer(master); if (master->mnt_group_id == origin->mnt_group_id) break; if (master->mnt_slave.next == &m->mnt_slave) break; m = master; } if (m == origin) return NULL; } } /* all accesses are serialized by namespace_sem */ static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct hlist_head *list; static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } static int propagate_one(struct mount *m, struct mountpoint *dest_mp) { struct mount *child; int type; /* skip ones added by this propagate_mnt() */ if (IS_MNT_PROPAGATED(m)) return 0; /* skip if mountpoint isn't covered by it */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { struct mount *n, *p; bool done; for (n = m; ; n = p) { p = n->mnt_master; if (p == dest_master || IS_MNT_MARKED(p)) break; } do { struct mount *parent = last_source->mnt_parent; if (peers(last_source, first_source)) break; done = parent->mnt_master == p; if (done && peers(n, parent)) break; last_source = last_source->mnt_master; } while (!done); type = CL_SLAVE; /* beginning of peer group among the slaves? */ if (IS_MNT_SHARED(m)) type |= CL_MAKE_SHARED; } child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); read_seqlock_excl(&mount_lock); mnt_set_mountpoint(m, dest_mp, child); if (m->mnt_master != dest_master) SET_MNT_MARK(m->mnt_master); read_sequnlock_excl(&mount_lock); last_dest = m; last_source = child; hlist_add_head(&child->mnt_hash, list); return count_mounts(m->mnt_ns, child); } /* * mount 'source_mnt' under the destination 'dest_mnt' at * dentry 'dest_dentry'. And propagate that mount to * all the peer and slave mounts of 'dest_mnt'. * Link all the new mounts into a propagation tree headed at * source_mnt. Also link all the new mounts using ->mnt_list * headed at source_mnt's ->mnt_list * * @dest_mnt: destination mount. * @dest_dentry: destination dentry. * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. */ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { struct mount *m, *n; int ret = 0; /* * we don't want to bother passing tons of arguments to * propagate_one(); everything is serialized by namespace_sem, * so globals will do just fine. */ last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; list = tree_list; dest_master = dest_mnt->mnt_master; /* all peers of dest_mnt, except dest_mnt itself */ for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { ret = propagate_one(n, dest_mp); if (ret) goto out; } /* all slave groups */ for (m = next_group(dest_mnt, dest_mnt); m; m = next_group(m, dest_mnt)) { /* everything in that slave group */ n = m; do { ret = propagate_one(n, dest_mp); if (ret) goto out; n = next_peer(n); } while (n != m); } out: read_seqlock_excl(&mount_lock); hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent; if (m->mnt_master != dest_mnt->mnt_master) CLEAR_MNT_MARK(m->mnt_master); } read_sequnlock_excl(&mount_lock); return ret; } static struct mount *find_topper(struct mount *mnt) { /* If there is exactly one mount covering mnt completely return it. */ struct mount *child; if (!list_is_singular(&mnt->mnt_mounts)) return NULL; child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); if (child->mnt_mountpoint != mnt->mnt.mnt_root) return NULL; return child; } /* * return true if the refcount is greater than count */ static inline int do_refcount_check(struct mount *mnt, int count) { return mnt_get_count(mnt) > count; } /** * propagation_would_overmount - check whether propagation from @from * would overmount @to * @from: shared mount * @to: mount to check * @mp: future mountpoint of @to on @from * * If @from propagates mounts to @to, @from and @to must either be peers * or one of the masters in the hierarchy of masters of @to must be a * peer of @from. * * If the root of the @to mount is equal to the future mountpoint @mp of * the @to mount on @from then @to will be overmounted by whatever is * propagated to it. * * Context: This function expects namespace_lock() to be held and that * @mp is stable. * Return: If @from overmounts @to, true is returned, false if not. */ bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp) { if (!IS_MNT_SHARED(from)) return false; if (IS_MNT_PROPAGATED(to)) return false; if (to->mnt.mnt_root != mp->m_dentry) return false; for (const struct mount *m = to; m; m = m->mnt_master) { if (peers(from, m)) return true; } return false; } /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount * NOTE: unmounting 'mnt' would naturally propagate to all * other mounts its parent propagates to. * Check if any of these mounts that **do not have submounts** * have more references than 'refcnt'. If so return busy. * * vfsmount lock must be held for write */ int propagate_mount_busy(struct mount *mnt, int refcnt) { struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; if (mnt == parent) return do_refcount_check(mnt, refcnt); /* * quickly check if the current mount can be unmounted. * If not, we don't have to go checking for all other * mounts */ if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) return 1; for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { int count = 1; child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; /* Is there exactly one mount on the child that covers * it completely whose reference should be ignored? */ topper = find_topper(child); if (topper) count += 1; else if (!list_empty(&child->mnt_mounts)) continue; if (do_refcount_check(child, count)) return 1; } return 0; } /* * Clear MNT_LOCKED when it can be shown to be safe. * * mount_lock lock must be held for write */ void propagate_mount_unlock(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m, *child; BUG_ON(parent == mnt); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } } static void umount_one(struct mount *mnt, struct list_head *to_umount) { CLEAR_MNT_MARK(mnt); mnt->mnt.mnt_flags |= MNT_UMOUNT; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_umounting); move_from_ns(mnt, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ static bool __propagate_umount(struct mount *mnt, struct list_head *to_umount, struct list_head *to_restore) { bool progress = false; struct mount *child; /* * The state of the parent won't change if this mount is * already unmounted or marked as without children. */ if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) goto out; /* Verify topper is the only grandchild that has not been * speculatively unmounted. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (child->mnt_mountpoint == mnt->mnt.mnt_root) continue; if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) continue; /* Found a mounted child */ goto children; } /* Mark mounts that can be unmounted if not locked */ SET_MNT_MARK(mnt); progress = true; /* If a mount is without children and not locked umount it. */ if (!IS_MNT_LOCKED(mnt)) { umount_one(mnt, to_umount); } else { children: list_move_tail(&mnt->mnt_umounting, to_restore); } out: return progress; } static void umount_list(struct list_head *to_umount, struct list_head *to_restore) { struct mount *mnt, *child, *tmp; list_for_each_entry(mnt, to_umount, mnt_list) { list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { /* topper? */ if (child->mnt_mountpoint == mnt->mnt.mnt_root) list_move_tail(&child->mnt_umounting, to_restore); else umount_one(child, to_umount); } } } static void restore_mounts(struct list_head *to_restore) { /* Restore mounts to a clean working state */ while (!list_empty(to_restore)) { struct mount *mnt, *parent; struct mountpoint *mp; mnt = list_first_entry(to_restore, struct mount, mnt_umounting); CLEAR_MNT_MARK(mnt); list_del_init(&mnt->mnt_umounting); /* Should this mount be reparented? */ mp = mnt->mnt_mp; parent = mnt->mnt_parent; while (parent->mnt.mnt_flags & MNT_UMOUNT) { mp = parent->mnt_mp; parent = parent->mnt_parent; } if (parent != mnt->mnt_parent) { mnt_change_mountpoint(parent, mp, mnt); mnt_notify_add(mnt); } } } static void cleanup_umount_visitations(struct list_head *visited) { while (!list_empty(visited)) { struct mount *mnt = list_first_entry(visited, struct mount, mnt_umounting); list_del_init(&mnt->mnt_umounting); } } /* * collect all mounts that receive propagation from the mount in @list, * and return these additional mounts in the same list. * @list: the list of mounts to be unmounted. * * vfsmount lock must be held for write */ int propagate_umount(struct list_head *list) { struct mount *mnt; LIST_HEAD(to_restore); LIST_HEAD(to_umount); LIST_HEAD(visited); /* Find candidates for unmounting */ list_for_each_entry_reverse(mnt, list, mnt_list) { struct mount *parent = mnt->mnt_parent; struct mount *m; /* * If this mount has already been visited it is known that it's * entire peer group and all of their slaves in the propagation * tree for the mountpoint has already been visited and there is * no need to visit them again. */ if (!list_empty(&mnt->mnt_umounting)) continue; list_add_tail(&mnt->mnt_umounting, &visited); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; if (!list_empty(&child->mnt_umounting)) { /* * If the child has already been visited it is * know that it's entire peer group and all of * their slaves in the propgation tree for the * mountpoint has already been visited and there * is no need to visit this subtree again. */ m = skip_propagation_subtree(m, parent); continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* * We have come across a partially unmounted * mount in a list that has not been visited * yet. Remember it has been visited and * continue about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; } /* Check the child and parents while progress is made */ while (__propagate_umount(child, &to_umount, &to_restore)) { /* Is the parent a umount candidate? */ child = child->mnt_parent; if (list_empty(&child->mnt_umounting)) break; } } } umount_list(&to_umount, &to_restore); restore_mounts(&to_restore); cleanup_umount_visitations(&visited); list_splice_tail(&to_umount, list); return 0; }
144 145 145 145 145 145 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to io context handling */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/sched/task.h> #include "blk.h" #include "blk-mq-sched.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; #ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. */ static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); } /* * Exit an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_exit_icq(struct io_cq *icq) { struct elevator_type *et = icq->q->elevator->type; if (icq->flags & ICQ_EXITED) return; if (et->ops.exit_icq) et->ops.exit_icq(icq); icq->flags |= ICQ_EXITED; } static void ioc_exit_icqs(struct io_context *ioc) { struct io_cq *icq; spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) ioc_exit_icq(icq); spin_unlock_irq(&ioc->lock); } /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; struct request_queue *q = icq->q; struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); lockdep_assert_held(&q->queue_lock); if (icq->flags & ICQ_DESTROYED) return; radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); list_del_init(&icq->q_node); /* * Both setting lookup hint to and clearing it from @icq are done * under queue_lock. If it's not pointing to @icq now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); ioc_exit_icq(icq); /* * @icq->q might have gone away by the time RCU callback runs * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; icq->flags |= ICQ_DESTROYED; kfree_rcu(icq, __rcu_head); } /* * Slow path for ioc release in put_io_context(). Performs double-lock * dancing to unlink all icq's and then frees ioc. */ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, struct io_cq, ioc_node); struct request_queue *q = icq->q; if (spin_trylock(&q->queue_lock)) { ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { /* Make sure q and icq cannot be freed. */ rcu_read_lock(); /* Re-acquire the locks in the correct order. */ spin_unlock(&ioc->lock); spin_lock(&q->queue_lock); spin_lock(&ioc->lock); ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); rcu_read_unlock(); } } spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } /* * Releasing icqs requires reverse order double locking and we may already be * holding a queue_lock. Do it asynchronously from a workqueue. */ static bool ioc_delay_free(struct io_context *ioc) { unsigned long flags; spin_lock_irqsave(&ioc->lock, flags); if (!hlist_empty(&ioc->icq_list)) { queue_work(system_power_efficient_wq, &ioc->release_work); spin_unlock_irqrestore(&ioc->lock, flags); return true; } spin_unlock_irqrestore(&ioc->lock, flags); return false; } /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared * * Walk @q->icq_list and exit all io_cq's. */ void ioc_clear_queue(struct request_queue *q) { spin_lock_irq(&q->queue_lock); while (!list_empty(&q->icq_list)) { struct io_cq *icq = list_first_entry(&q->icq_list, struct io_cq, q_node); /* * Other context won't hold ioc lock to wait for queue_lock, see * details in ioc_release_fn(). */ spin_lock(&icq->ioc->lock); ioc_destroy_icq(icq); spin_unlock(&icq->ioc->lock); } spin_unlock_irq(&q->queue_lock); } #else /* CONFIG_BLK_ICQ */ static inline void ioc_exit_icqs(struct io_context *ioc) { } static inline bool ioc_delay_free(struct io_context *ioc) { return false; } #endif /* CONFIG_BLK_ICQ */ /** * put_io_context - put a reference of io_context * @ioc: io_context to put * * Decrement reference count of @ioc and release it if the count reaches * zero. */ void put_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) kmem_cache_free(iocontext_cachep, ioc); } EXPORT_SYMBOL_GPL(put_io_context); /* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); if (atomic_dec_and_test(&ioc->active_ref)) { ioc_exit_icqs(ioc); put_io_context(ioc); } } static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) return NULL; atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->active_ref, 1); #ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); #endif ioc->ioprio = IOPRIO_DEFAULT; return ioc; } int set_task_ioprio(struct task_struct *task, int ioprio) { int err; const struct cred *cred = current_cred(), *tcred; rcu_read_lock(); tcred = __task_cred(task); if (!uid_eq(tcred->uid, cred->euid) && !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); err = security_task_setioprio(task, ioprio); if (err) return err; task_lock(task); if (unlikely(!task->io_context)) { struct io_context *ioc; task_unlock(task); ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); if (!ioc) return -ENOMEM; task_lock(task); if (task->flags & PF_EXITING) { kmem_cache_free(iocontext_cachep, ioc); goto out; } if (task->io_context) kmem_cache_free(iocontext_cachep, ioc); else task->io_context = ioc; } task->io_context->ioprio = ioprio; out: task_unlock(task); return 0; } EXPORT_SYMBOL_GPL(set_task_ioprio); int __copy_io(unsigned long clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; /* * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { atomic_inc(&ioc->active_ref); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); if (!tsk->io_context) return -ENOMEM; tsk->io_context->ioprio = ioc->ioprio; } return 0; } #ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. */ struct io_cq *ioc_lookup_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq; lockdep_assert_held(&q->queue_lock); /* * icq's are indexed from @ioc using radix tree and hint pointer, * both of which are protected with RCU. All removals are done * holding both q and ioc locks, and we're holding q lock - if we * find a icq which points to us, it's guaranteed to be valid. */ rcu_read_lock(); icq = rcu_dereference(ioc->icq_hint); if (icq && icq->q == q) goto out; icq = radix_tree_lookup(&ioc->icq_tree, q->id); if (icq && icq->q == q) rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ else icq = NULL; out: rcu_read_unlock(); return icq; } EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq * @q: request_queue of interest * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. * * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ static struct io_cq *ioc_create_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } icq->ioc = ioc; icq->q = q; INIT_LIST_HEAD(&icq->q_node); INIT_HLIST_NODE(&icq->ioc_node); /* lock both q and ioc and try to link @icq */ spin_lock_irq(&q->queue_lock); spin_lock(&ioc->lock); if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); if (et->ops.init_icq) et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } spin_unlock(&ioc->lock); spin_unlock_irq(&q->queue_lock); radix_tree_preload_end(); return icq; } struct io_cq *ioc_find_get_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq = NULL; if (unlikely(!ioc)) { ioc = alloc_io_context(GFP_ATOMIC, q->node); if (!ioc) return NULL; task_lock(current); if (current->io_context) { kmem_cache_free(iocontext_cachep, ioc); ioc = current->io_context; } else { current->io_context = ioc; } get_io_context(ioc); task_unlock(current); } else { get_io_context(ioc); spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(q); spin_unlock_irq(&q->queue_lock); } if (!icq) { icq = ioc_create_icq(q); if (!icq) { put_io_context(ioc); return NULL; } } return icq; } EXPORT_SYMBOL_GPL(ioc_find_get_icq); #endif /* CONFIG_BLK_ICQ */ static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL); return 0; } subsys_initcall(blk_ioc_init);
11 268 265 266 10 262 12 264 6 6 2 6 1 6 5 6 6 6 6 294 295 36 3515 3518 3516 3516 1379 264 1123 1126 1124 1126 1 1114 1117 130 1118 1134 1119 16 2517 2532 33 1 32 1 1119 1004 115 32 83 115 118 972 36 3638 3409 258 2906 19 2888 248 2662 2876 2 2876 2873 4 4 4 1 30 651 184 649 2 649 294 539 540 151 53 16 15 12 16 15 15 15 207 25 208 1 206 301 268 106 82 67 10 221 176 65 70 413 414 28 13 566 110 21 21 551 553 2 12 3 11 195 61 247 246 12 12 7 7 356 2336 745 745 95 15 15 15 15 15 15 15 15 3 12 15 11 11 11 10 2693 1 20 2682 2678 2677 2252 4 6 2 1 19 767 1463 1466 2207 119 119 360 3 1 1 1 117 238 239 971 18 959 3647 1293 2309 2304 530 531 494 2 2 2 12 12 12 12 11 11 11 11 15 1 1 4 8 2 12 11 11 1 3 7 11 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static bool super_flags(const struct super_block *sb, unsigned int flags) { /* * Pairs with smp_store_release() in super_wake() and ensures * that we see @flags after we're woken. */ return smp_load_acquire(&sb->s_flags) & flags; } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: The function returns true if SB_BORN was set and with * s_umount held. The function returns false if SB_DYING was * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); /* Don't pointlessly acquire s_umount. */ if (super_flags(sb, SB_DYING)) return false; __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ if (sb->s_flags & SB_DYING) { super_unlock(sb, excl); return false; } WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); return true; } /* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); for (int i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) goto fail; s->s_shrink->scan_objects = super_cache_scan; s->s_shrink->count_objects = super_cache_count; s->s_shrink->batch = 1024; s->s_shrink->private_data = s; if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super(struct super_block *sb) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_excl(sb); if (locked) { if (atomic_inc_not_zero(&sb->s_active)) { put_super(sb); return true; } super_unlock_excl(sb); } wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. */ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(sb); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL, "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. * * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; /* * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is * not set, as the filesystem is likely unprepared to handle it. * This can happen when fsconfig() is called from init_user_ns with * an fs_fd opened in another user namespace. */ if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); return ERR_PTR(-EPERM); } retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); shrinker_register(s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; /* We don't yet pass the user namespace of the parent * mount through to here so always use &init_user_ns * until that changes. */ if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); static void __iterate_supers(void (*f)(struct super_block *)) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); f(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { if (sb->s_root) f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool locked; sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); if (locked) { if (sb->s_root) f(sb, arg); super_unlock_shared(sb); } spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { bool locked; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ locked = super_lock(sb, excl); if (locked) { if (sb->s_root) return sb; super_unlock(sb, excl); } /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); break; } } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb) { bool locked = super_lock_excl(sb); if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } if (locked) super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb) { bool locked = super_lock_excl(sb); if (locked && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); return; } if (locked) super_unlock_excl(sb); } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock the superblock that is holder of the bdev. Returns the superblock * pointer if we successfully locked the superblock and it is alive. Otherwise * we return NULL and just unlock bdev->bd_holder_lock. * * The function must be called with bdev->bd_holder_lock and releases it. */ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); lockdep_assert_not_held(&bdev->bd_disk->open_mutex); /* Make sure sb doesn't go away from under us */ spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); locked = super_lock(sb, excl); /* * If the superblock wasn't already SB_DYING then we hold * s_umount and can safely drop our temporary reference. */ put_super(sb); if (!locked) return NULL; if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock(sb, excl); return NULL; } return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; sb = bdev_super_lock(bdev, false); if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); } static struct super_block *get_bdev_super(struct block_device *bdev) { bool active = false; struct super_block *sb; sb = bdev_super_lock(bdev, true); if (sb) { active = atomic_inc_not_zero(&sb->s_active); super_unlock_excl(sb); } if (!active) return NULL; return sb; } /** * fs_bdev_freeze - freeze owning filesystem of block device * @bdev: block device * * Freeze the filesystem that owns this block device if it is still * active. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ static int fs_bdev_freeze(struct block_device *bdev) { struct super_block *sb; int error = 0; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else error = freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); return error; } /** * fs_bdev_thaw - thaw owning filesystem of block device * @bdev: block device * * Thaw the filesystem that owns this block device. * * A filesystem that owns multiple block devices may be frozen from each * block device and won't be unfrozen until all block devices are * unfrozen. Each block device can only freeze the filesystem once as we * nest freezes for block devices in the block layer. * * Return: If the thaw was successful zero is returned. If the thaw * failed a negative error code is returned. If this function * returns zero it doesn't mean that the filesystem is unfrozen * as it may have been frozen multiple times (kernel may hold a * freeze or might be frozen from other block devices). */ static int fs_bdev_thaw(struct block_device *bdev) { struct super_block *sb; int error; lockdep_assert_held(&bdev->bd_fsfreeze_mutex); /* * The block device may have been frozen before it was claimed by a * filesystem. Concurrently another process might try to mount that * frozen block device and has temporarily claimed the block device for * that purpose causing a concurrent fs_bdev_thaw() to end up here. The * mounter is already about to abort mounting because they still saw an * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return * NULL in that case. */ sb = get_bdev_super(bdev); if (!sb) return -EINVAL; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else error = thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); deactivate_super(sb); return error; } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, .freeze = fs_bdev_freeze, .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct file *bdev_file; struct block_device *bdev; bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev_file); } bdev = file_bdev(bdev_file); /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { bdev_fput(bdev_file); return -EACCES; } /* * It is enough to check bdev was not frozen before we set * s_bdev as freezing will wait until SB_BORN is set. */ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_fput(bdev_file); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev_file = bdev_file; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev_flags - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock * @flags: GET_TREE_BDEV_* flags */ int get_tree_bdev_flags(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), unsigned int flags) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP)) errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { error = setup_bdev_super(s, fc->sb_flags, fc); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL_GPL(get_tree_bdev_flags); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { return get_tree_bdev_flags(fc, fill_super, 0); } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { error = setup_bdev_super(s, flags, NULL); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); bdev_fput(sb->s_bdev_file); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) #define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) ++sb->s_writers.freeze_kcount; if (who & FREEZE_HOLDER_USERSPACE) ++sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) --sb->s_writers.freeze_kcount; if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) --sb->s_writers.freeze_ucount; return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) { WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); if (who & FREEZE_HOLDER_KERNEL) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_kcount == 0; if (who & FREEZE_HOLDER_USERSPACE) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_ucount == 0; return false; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * A filesystem may hold multiple devices and thus a filesystems may be * frozen through the block layer via multiple block devices. In this * case the request is marked as being allowed to nest by passing * FREEZE_MAY_NEST. The filesystem remains frozen until all block * devices are unfrozen. If multiple freezes are attempted without * FREEZE_MAY_NEST -EBUSY will be returned. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. * * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while freezing!"); return -EINVAL; } atomic_inc(&sb->s_active); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (may_freeze(sb, who)) ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); else ret = -EBUSY; /* All freezers share a single active reference. */ deactivate_locked_super(sb); return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (sb_rdonly(sb)) { /* Nothing to do really... */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error = -EINVAL; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; /* * All freezers share a single active reference. * So just unlock in case there are any left. */ if (freeze_dec(sb, who)) goto out_unlock; if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { pr_err("VFS: Filesystem thaw failed\n"); freeze_inc(sb, who); lockdep_sb_freeze_release(sb); goto out_unlock; } } sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out_deactivate: deactivate_locked_super(sb); return 0; out_unlock: super_unlock_excl(sb); return error; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed * * A filesystem may hold multiple devices and thus a filesystems may * have been frozen through the block layer via multiple block devices. * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while thawing!"); return -EINVAL; } return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
9 269 217 52 264 75 339 339 339 339 339 338 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 269 268 60 60 27 36 4 56 4 60 60 60 60 214 5 208 208 209 209 274 65 164 5 209 208 209 209 208 271 3 4 4 4 4 270 4 274 60 27 70 274 92 273 274 274 5 274 2 1 1 270 2 270 270 270 358 356 358 357 274 349 354 354 354 353 354 354 6 5 291 2 1 77 211 288 288 284 1 282 1 9 2 7 373 373 373 2 272 303 351 351 2 327 272 349 33 282 282 282 282 282 282 282 281 282 282 282 281 282 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_icreate_item.h" #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_ag.h" #include "xfs_health.h" /* * Lookup a record by ino in the btree given by cur. */ int /* error */ xfs_inobt_lookup( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agino_t ino, /* starting inode of chunk */ xfs_lookup_t dir, /* <=, >=, == */ int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; cur->bc_rec.i.ir_holemask = 0; cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); } /* * Update the record referred to by cur to the value given. * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int /* error */ xfs_inobt_update( struct xfs_btree_cur *cur, /* btree cursor */ xfs_inobt_rec_incore_t *irec) /* btree record */ { union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); if (xfs_has_sparseinodes(cur->bc_mp)) { rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); rec.inobt.ir_u.sp.ir_count = irec->ir_count; rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; } else { /* ir_holemask/ir_count not supported on-disk */ rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } /* Convert on-disk btree record to incore inobt record. */ void xfs_inobt_btrec_to_irec( struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec) { irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); if (xfs_has_sparseinodes(mp)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; } else { /* * ir_holemask/ir_count not supported on-disk. Fill in hardcoded * values for full inode chunks. */ irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; irec->ir_count = XFS_INODES_PER_CHUNK; irec->ir_freecount = be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } /* Compute the freecount of an incore inode record. */ uint8_t xfs_inobt_rec_freecount( const struct xfs_inobt_rec_incore *irec) { uint64_t realfree = irec->ir_free; if (xfs_inobt_issparse(irec->ir_holemask)) realfree &= xfs_inobt_irec_to_allocmask(irec); return hweight64(realfree); } /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { /* Record has to be properly aligned within the AG. */ if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) return __this_address; if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; } static inline int xfs_inobt_complain_bad_rec( struct xfs_btree_cur *cur, xfs_failaddr_t fa, const struct xfs_inobt_rec_incore *irec) { struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, "%sbt record corruption in AG %d detected at %pS!", cur->bc_ops->name, cur->bc_group->xg_gno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, irec->ir_free, irec->ir_holemask); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } /* * Get the data from the pointed-to record. */ int xfs_inobt_get_rec( struct xfs_btree_cur *cur, struct xfs_inobt_rec_incore *irec, int *stat) { struct xfs_mount *mp = cur->bc_mp; union xfs_btree_rec *rec; xfs_failaddr_t fa; int error; error = xfs_btree_get_rec(cur, &rec, stat); if (error || *stat == 0) return error; xfs_inobt_btrec_to_irec(mp, rec, irec); fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); return 0; } /* * Insert a single inobt record. Cursor must already point to desired location. */ int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, uint16_t holemask, uint8_t count, int32_t freecount, xfs_inofree_t free, int *stat) { cur->bc_rec.i.ir_holemask = holemask; cur->bc_rec.i.ir_count = count; cur->bc_rec.i.ir_freecount = freecount; cur->bc_rec.i.ir_free = free; return xfs_btree_insert(cur, stat); } /* * Insert records describing a newly allocated inode chunk into the inobt. */ STATIC int xfs_inobt_insert( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t newino, xfs_agino_t newlen, bool is_finobt) { struct xfs_btree_cur *cur; xfs_agino_t thisino; int i; int error; if (is_finobt) cur = xfs_finobt_init_cursor(pag, tp, agbp); else cur = xfs_inobt_init_cursor(pag, tp, agbp); for (thisino = newino; thisino < newino + newlen; thisino += XFS_INODES_PER_CHUNK) { error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } ASSERT(i == 0); error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, XFS_INODES_PER_CHUNK, XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } ASSERT(i == 1); } xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; } /* * Verify that the number of free inodes in the AGI is correct. */ #ifdef DEBUG static int xfs_check_agi_freecount( struct xfs_btree_cur *cur) { if (cur->bc_nlevels == 1) { xfs_inobt_rec_incore_t rec; int freecount = 0; int error; int i; error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; do { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; if (i) { freecount += rec.ir_freecount; error = xfs_btree_increment(cur, 0, &i); if (error) return error; } } while (i == 1); if (!xfs_is_shutdown(cur->bc_mp)) { ASSERT(freecount == to_perag(cur->bc_group)->pagi_freecount); } } return 0; } #else #define xfs_check_agi_freecount(cur) 0 #endif /* * Initialise a new set of inodes. When called without a transaction context * (e.g. from recovery) we initiate a delayed write of the inode buffers rather * than logging them (which in a transaction context puts them into the AIL * for writeback rather than the xfsbufd queue). */ int xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, struct list_head *buffer_list, int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen) { struct xfs_buf *fbuf; struct xfs_dinode *free; int nbufs; int version; int i, j; xfs_daddr_t d; xfs_ino_t ino = 0; int error; /* * Loop over the new block(s), filling in the inodes. For small block * sizes, manipulate the inodes in buffers which are multiples of the * blocks size. */ nbufs = length / M_IGEO(mp)->blocks_per_cluster; /* * Figure out what version number to use in the inodes we create. If * the superblock version has caught up to the one that supports the new * inode format, then use the new inode version. Otherwise use the old * version so that old kernels will continue to be able to use the file * system. * * For v3 inodes, we also need to write the inode number into the inode, * so calculate the first inode number of the chunk here as * XFS_AGB_TO_AGINO() only works within a filesystem block, not * across multiple filesystem blocks (such as a cluster) and so cannot * be used in the cluster buffer loop below. * * Further, because we are writing the inode directly into the buffer * and calculating a CRC on the entire inode, we have ot log the entire * inode so that the entire range the CRC covers is present in the log. * That means for v3 inode we log the entire buffer rather than just the * inode cores. */ if (xfs_has_v3inodes(mp)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); /* * log the initialisation that is about to take place as an * logical operation. This means the transaction does not * need to log the physical changes to the inode buffers as log * recovery will know what initialisation is actually needed. * Hence we only need to log the buffers as "ordered" buffers so * they track in the AIL as if they were physically logged. */ if (tp) xfs_icreate_log(tp, agno, agbno, icount, mp->m_sb.sb_inodesize, length, gen); } else version = 2; for (j = 0; j < nbufs; j++) { /* * Get the block. */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, 0, &fbuf); if (error) return error; /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_version = version; free->di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); if (version == 3) { free->di_ino = cpu_to_be64(ino); ino++; uuid_copy(&free->di_uuid, &mp->m_sb.sb_meta_uuid); xfs_dinode_calc_crc(mp, free); } else if (tp) { /* just log the inode core */ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + XFS_DINODE_SIZE(mp) - 1); } } if (tp) { /* * Mark the buffer as an inode allocation buffer so it * sticks in AIL at the point of this allocation * transaction. This ensures the they are on disk before * the tail of the log can be moved past this * transaction (i.e. by preventing relogging from moving * it forward in the log). */ xfs_trans_inode_alloc_buf(tp, fbuf); if (version == 3) { /* * Mark the buffer as ordered so that they are * not physically logged in the transaction but * still tracked in the AIL as part of the * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); } } else { fbuf->b_flags |= XBF_DONE; xfs_buf_delwri_queue(fbuf, buffer_list); xfs_buf_relse(fbuf); } } return 0; } /* * Align startino and allocmask for a recently allocated sparse chunk such that * they are fit for insertion (or merge) into the on-disk inode btrees. * * Background: * * When enabled, sparse inode support increases the inode alignment from cluster * size to inode chunk size. This means that the minimum range between two * non-adjacent inode records in the inobt is large enough for a full inode * record. This allows for cluster sized, cluster aligned block allocation * without need to worry about whether the resulting inode record overlaps with * another record in the tree. Without this basic rule, we would have to deal * with the consequences of overlap by potentially undoing recent allocations in * the inode allocation codepath. * * Because of this alignment rule (which is enforced on mount), there are two * inobt possibilities for newly allocated sparse chunks. One is that the * aligned inode record for the chunk covers a range of inodes not already * covered in the inobt (i.e., it is safe to insert a new sparse record). The * other is that a record already exists at the aligned startino that considers * the newly allocated range as sparse. In the latter case, record content is * merged in hope that sparse inode chunks fill to full chunks over time. */ STATIC void xfs_align_sparse_ino( struct xfs_mount *mp, xfs_agino_t *startino, uint16_t *allocmask) { xfs_agblock_t agbno; xfs_agblock_t mod; int offset; agbno = XFS_AGINO_TO_AGBNO(mp, *startino); mod = agbno % mp->m_sb.sb_inoalignmt; if (!mod) return; /* calculate the inode offset and align startino */ offset = XFS_AGB_TO_AGINO(mp, mod); *startino -= offset; /* * Since startino has been aligned down, left shift allocmask such that * it continues to represent the same physical inodes relative to the * new startino. */ *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; } /* * Determine whether the source inode record can merge into the target. Both * records must be sparse, the inode ranges must match and there must be no * allocation overlap between the records. */ STATIC bool __xfs_inobt_can_merge( struct xfs_inobt_rec_incore *trec, /* tgt record */ struct xfs_inobt_rec_incore *srec) /* src record */ { uint64_t talloc; uint64_t salloc; /* records must cover the same inode range */ if (trec->ir_startino != srec->ir_startino) return false; /* both records must be sparse */ if (!xfs_inobt_issparse(trec->ir_holemask) || !xfs_inobt_issparse(srec->ir_holemask)) return false; /* both records must track some inodes */ if (!trec->ir_count || !srec->ir_count) return false; /* can't exceed capacity of a full record */ if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) return false; /* verify there is no allocation overlap */ talloc = xfs_inobt_irec_to_allocmask(trec); salloc = xfs_inobt_irec_to_allocmask(srec); if (talloc & salloc) return false; return true; } /* * Merge the source inode record into the target. The caller must call * __xfs_inobt_can_merge() to ensure the merge is valid. */ STATIC void __xfs_inobt_rec_merge( struct xfs_inobt_rec_incore *trec, /* target */ struct xfs_inobt_rec_incore *srec) /* src */ { ASSERT(trec->ir_startino == srec->ir_startino); /* combine the counts */ trec->ir_count += srec->ir_count; trec->ir_freecount += srec->ir_freecount; /* * Merge the holemask and free mask. For both fields, 0 bits refer to * allocated inodes. We combine the allocated ranges with bitwise AND. */ trec->ir_holemask &= srec->ir_holemask; trec->ir_free &= srec->ir_free; } /* * Insert a new sparse inode chunk into the associated inode allocation btree. * The inode record for the sparse chunk is pre-aligned to a startino that * should match any pre-existing sparse inode record in the tree. This allows * sparse chunks to fill over time. * * If no preexisting record exists, the provided record is inserted. * If there is a preexisting record, the provided record is merged with the * existing record and updated in place. The merged record is returned in nrec. * * It is considered corruption if a merge is requested and not possible. Given * the sparse inode alignment constraints, this should never happen. */ STATIC int xfs_inobt_insert_sprec( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */ { struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; struct xfs_inobt_rec_incore rec; cur = xfs_inobt_init_cursor(pag, tp, agbp); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) goto error; /* if nothing there, insert a new record and return */ if (i == 0) { error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, nrec->ir_count, nrec->ir_freecount, nrec->ir_free, &i); if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } goto out; } /* * A record exists at this startino. Merge the records. */ error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } /* * This should never fail. If we have coexisting records that * cannot merge, something is seriously wrong. */ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } trace_xfs_irec_merge_pre(pag, &rec, nrec); /* merge to nrec to output the updated record */ __xfs_inobt_rec_merge(nrec, &rec); trace_xfs_irec_merge_post(pag, nrec); error = xfs_inobt_rec_check_count(mp, nrec); if (error) goto error; error = xfs_inobt_update(cur, nrec); if (error) goto error; out: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; error: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Insert a new sparse inode chunk into the free inode btree. The inode * record for the sparse chunk is pre-aligned to a startino that should match * any pre-existing sparse inode record in the tree. This allows sparse chunks * to fill over time. * * The new record is always inserted, overwriting a pre-existing record if * there is one. */ STATIC int xfs_finobt_insert_sprec( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. */ { struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; int error; int i; cur = xfs_finobt_init_cursor(pag, tp, agbp); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) goto error; /* if nothing there, insert a new record and return */ if (i == 0) { error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, nrec->ir_count, nrec->ir_freecount, nrec->ir_free, &i); if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } } else { error = xfs_inobt_update(cur, nrec); if (error) goto error; } xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; error: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Allocate new inodes in the allocation group specified by agbp. Returns 0 if * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so * the caller knows it can try another AG, a hard -ENOSPC when over the maximum * inode count threshold, or the usual negative error code for other errors. */ STATIC int xfs_ialloc_ag_alloc( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp) { struct xfs_agi *agi; struct xfs_alloc_arg args; int error; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe */ /* unit boundary */ /* init. to full chunk */ struct xfs_inobt_rec_incore rec; struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp); uint16_t allocmask = (uint16_t) -1; int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; args.fsbno = NULLFSBLOCK; args.oinfo = XFS_RMAP_OINFO_INODES; args.pag = pag; #ifdef DEBUG /* randomly do sparse inode allocations */ if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) do_sparse = get_random_u32_below(2); #endif /* * Locking will ensure that we don't have two callers in here * at one time. */ newlen = igeo->ialloc_inos; if (igeo->maxicount && percpu_counter_read_positive(&args.mp->m_icount) + newlen > igeo->maxicount) return -ENOSPC; args.minlen = args.maxlen = igeo->ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. */ agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + igeo->ialloc_blks; if (do_sparse) goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.prod = 1; /* * We need to take into account alignment here to ensure that * we don't modify the free list if we fail to have an exact * block. If we don't have an exact match, and every oher * attempt allocation attempt fails, we'll end up cancelling * a dirty transaction and shutting down. * * For an exact allocation, alignment must be 1, * however we need to take cluster alignment into account when * fixing up the freelist. Use the minalignslop field to * indicate that extra blocks might be required for alignment, * but not to use them in the actual exact allocation. */ args.alignment = 1; args.minalignslop = igeo->cluster_align - 1; /* Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_exact_bno(&args, xfs_agbno_to_fsb(pag, args.agbno)); if (error) return error; /* * This request might have dirtied the transaction if the AG can * satisfy the request, but the exact block was not available. * If the allocation did fail, subsequent requests will relax * the exact agbno requirement and increase the alignment * instead. It is critical that the total size of the request * (len + alignment + slop) does not increase from this point * on, so reset minalignslop to ensure it is not included in * subsequent requests. */ args.minalignslop = 0; } if (unlikely(args.fsbno == NULLFSBLOCK)) { /* * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. * If the cluster size is smaller than a filesystem block * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; if (igeo->ialloc_align) { ASSERT(!xfs_has_noalign(args.mp)); args.alignment = args.mp->m_dalign; isaligned = 1; } else args.alignment = igeo->cluster_align; /* * Allocate a fixed-size extent of inodes. */ args.prod = 1; /* * Allow space for the inode btree to split. */ args.minleft = igeo->inobt_maxlevels; error = xfs_alloc_vextent_near_bno(&args, xfs_agbno_to_fsb(pag, be32_to_cpu(agi->agi_root))); if (error) return error; } /* * If stripe alignment is turned on, then try again with cluster * alignment. */ if (isaligned && args.fsbno == NULLFSBLOCK) { args.alignment = igeo->cluster_align; error = xfs_alloc_vextent_near_bno(&args, xfs_agbno_to_fsb(pag, be32_to_cpu(agi->agi_root))); if (error) return error; } /* * Finally, try a sparse allocation if the filesystem supports it and * the sparse allocation length is smaller than a full chunk. */ if (xfs_has_sparseinodes(args.mp) && igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; args.minlen = igeo->ialloc_min_blks; args.maxlen = args.minlen; /* * The inode record will be aligned to full chunk size. We must * prevent sparse allocation from AG boundaries that result in * invalid inode records, such as records that start at agbno 0 * or extend beyond the AG. * * Set min agbno to the first aligned, non-zero agbno and max to * the last aligned agbno that is at least one full chunk from * the end of the AG. */ args.min_agbno = args.mp->m_sb.sb_inoalignmt; args.max_agbno = round_down(xfs_ag_block_count(args.mp, pag_agno(pag)), args.mp->m_sb.sb_inoalignmt) - igeo->ialloc_blks; error = xfs_alloc_vextent_near_bno(&args, xfs_agbno_to_fsb(pag, be32_to_cpu(agi->agi_root))); if (error) return error; newlen = XFS_AGB_TO_AGINO(args.mp, args.len); ASSERT(newlen <= XFS_INODES_PER_CHUNK); allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; } if (args.fsbno == NULLFSBLOCK) return -EAGAIN; ASSERT(args.len == args.minlen); /* * Stamp and write the inode buffers. * * Seed the new inode cluster with a random generation number. This * prevents short-term reuse of generation numbers if a chunk is * freed and then immediately reallocated. We use random numbers * rather than a linear progression to prevent the next generation * number from being easily guessable. */ error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, pag_agno(pag), args.agbno, args.len, get_random_u32()); if (error) return error; /* * Convert the results. */ newino = XFS_AGB_TO_AGINO(args.mp, args.agbno); if (xfs_inobt_issparse(~allocmask)) { /* * We've allocated a sparse chunk. Align the startino and mask. */ xfs_align_sparse_ino(args.mp, &newino, &allocmask); rec.ir_startino = newino; rec.ir_holemask = ~allocmask; rec.ir_count = newlen; rec.ir_freecount = newlen; rec.ir_free = XFS_INOBT_ALL_FREE; /* * Insert the sparse record into the inobt and allow for a merge * if necessary. If a merge does occur, rec is updated to the * merged record. */ error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", xfs_agino_to_ino(pag, rec.ir_startino), rec.ir_holemask, rec.ir_count); xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); } if (error) return error; /* * We can't merge the part we've just allocated as for the inobt * due to finobt semantics. The original record may or may not * exist independent of whether physical inodes exist in this * sparse chunk. * * We must update the finobt record based on the inobt record. * rec contains the fully merged and up to date inobt record * from the previous call. Set merge false to replace any * existing record with this one. */ if (xfs_has_finobt(args.mp)) { error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec); if (error) return error; } } else { /* full chunk - insert new records to both btrees */ error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false); if (error) return error; if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, true); if (error) return error; } } /* * Update AGI counts and newino. */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag->pagi_freecount += newlen; pag->pagi_count += newlen; agi->agi_newino = cpu_to_be32(newino); /* * Log allocation group header fields */ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO); /* * Modify/log superblock values for inode count and inode free count. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen); return 0; } /* * Try to retrieve the next record to the left/right from the current one. */ STATIC int xfs_ialloc_next_rec( struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *done, int left) { int error; int i; if (left) error = xfs_btree_decrement(cur, 0, &i); else error = xfs_btree_increment(cur, 0, &i); if (error) return error; *done = !i; if (i) { error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } } return 0; } STATIC int xfs_ialloc_get_rec( struct xfs_btree_cur *cur, xfs_agino_t agino, xfs_inobt_rec_incore_t *rec, int *done) { int error; int i; error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); if (error) return error; *done = !i; if (i) { error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } } return 0; } /* * Return the offset of the first free inode in the record. If the inode chunk * is sparsely allocated, we convert the record holemask to inode granularity * and mask off the unallocated regions from the inode free mask. */ STATIC int xfs_inobt_first_free_inode( struct xfs_inobt_rec_incore *rec) { xfs_inofree_t realfree; /* if there are no holes, return the first available offset */ if (!xfs_inobt_issparse(rec->ir_holemask)) return xfs_lowbit64(rec->ir_free); realfree = xfs_inobt_irec_to_allocmask(rec); realfree &= rec->ir_free; return xfs_lowbit64(realfree); } /* * If this AG has corrupt inodes, check if allocating this inode would fail * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again * somewhere else. */ static int xfs_dialloc_check_ino( struct xfs_perag *pag, struct xfs_trans *tp, xfs_ino_t ino) { struct xfs_imap imap; struct xfs_buf *bp; int error; error = xfs_imap(pag, tp, ino, &imap, 0); if (error) return -EAGAIN; error = xfs_imap_to_bp(pag_mount(pag), tp, &imap, &bp); if (error) return -EAGAIN; xfs_trans_brelse(tp, bp); return 0; } /* * Allocate an inode using the inobt-only algorithm. */ STATIC int xfs_dialloc_ag_inobt( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); struct xfs_btree_cur *cur, *tcur; struct xfs_inobt_rec_incore rec, trec; xfs_ino_t ino; int error; int offset; int i, j; int searchdistance = 10; ASSERT(xfs_perag_initialised_agi(pag)); ASSERT(xfs_perag_allows_inodes(pag)); ASSERT(pag->pagi_freecount > 0); restart_pagno: cur = xfs_inobt_init_cursor(pag, tp, agbp); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. */ if (!pagino) pagino = be32_to_cpu(agi->agi_newino); error = xfs_check_agi_freecount(cur); if (error) goto error0; /* * If in the same AG as the parent, try to get near the parent. */ if (pagno == pag_agno(pag)) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; if (XFS_IS_CORRUPT(mp, j != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } if (rec.ir_freecount > 0) { /* * Found a free inode in the same chunk * as the parent, done. */ goto alloc_inode; } /* * In the same AG as parent, but parent's chunk is full. */ /* duplicate the cursor, search left & right simultaneously */ error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; /* * Skip to last blocks looked up if same parent inode. */ if (pagino != NULLAGINO && pag->pagl_pagino == pagino && pag->pagl_leftrec != NULLAGINO && pag->pagl_rightrec != NULLAGINO) { error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, &trec, &doneleft); if (error) goto error1; error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, &rec, &doneright); if (error) goto error1; } else { /* search left with tcur, back up 1 record */ error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); if (error) goto error1; /* search right with cur, go forward 1 record. */ error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); if (error) goto error1; } /* * Loop until we find an inode chunk with a free inode. */ while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < rec.ir_startino - pagino; } else { useleft = !doneleft; } /* free inodes to the left? */ if (useleft && trec.ir_freecount) { xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; rec = trec; goto alloc_inode; } /* free inodes to the right? */ if (!useleft && rec.ir_freecount) { xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; goto alloc_inode; } /* get next record to check */ if (useleft) { error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); } else { error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); } if (error) goto error1; } if (searchdistance <= 0) { /* * Not in range - save last search * location and allocate a new inode */ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; } else { /* * We've reached the end of the btree. because * we are only searching a small chunk of the * btree each search, there is obviously free * inodes closer to the parent inode than we * are now. restart the search again. */ pag->pagl_pagino = NULLAGINO; pag->pagl_leftrec = NULLAGINO; pag->pagl_rightrec = NULLAGINO; xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); goto restart_pagno; } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); if (error) goto error0; if (i == 1) { error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; if (j == 1 && rec.ir_freecount > 0) { /* * The last chunk allocated in the group * still has a free inode. */ goto alloc_inode; } } } /* * None left in the last group, search the whole AG */ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } } alloc_inode: offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { error = xfs_dialloc_check_ino(pag, tp, ino); if (error) goto error0; } rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; error = xfs_inobt_update(cur, &rec); if (error) goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount--; error = xfs_check_agi_freecount(cur); if (error) goto error0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Use the free inode btree to allocate an inode based on distance from the * parent. Note that the provided cursor may be deleted and replaced. */ STATIC int xfs_dialloc_ag_finobt_near( xfs_agino_t pagino, struct xfs_btree_cur **ocur, struct xfs_inobt_rec_incore *rec) { struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ struct xfs_btree_cur *rcur; /* right search cursor */ struct xfs_inobt_rec_incore rrec; int error; int i, j; error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); if (error) return error; if (i == 1) { error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) { xfs_btree_mark_sick(lcur); return -EFSCORRUPTED; } /* * See if we've landed in the parent inode record. The finobt * only tracks chunks with at least one free inode, so record * existence is enough. */ if (pagino >= rec->ir_startino && pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) return 0; } error = xfs_btree_dup_cursor(lcur, &rcur); if (error) return error; error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); if (error) goto error_rcur; if (j == 1) { error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } } if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } if (i == 1 && j == 1) { /* * Both the left and right records are valid. Choose the closer * inode chunk to the target. */ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > (rrec.ir_startino - pagino)) { *rec = rrec; xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); *ocur = rcur; } else { xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); } } else if (j == 1) { /* only the right record is valid */ *rec = rrec; xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); *ocur = rcur; } else if (i == 1) { /* only the left record is valid */ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); } return 0; error_rcur: xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); return error; } /* * Use the free inode btree to find a free inode based on a newino hint. If * the hint is NULL, find the first free inode in the AG. */ STATIC int xfs_dialloc_ag_finobt_newino( struct xfs_agi *agi, struct xfs_btree_cur *cur, struct xfs_inobt_rec_incore *rec) { int error; int i; if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); if (error) return error; if (i == 1) { error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } } /* * Find the first inode available in the AG. */ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } /* * Update the inobt based on a modification made to the finobt. Also ensure that * the records from both trees are equivalent post-modification. */ STATIC int xfs_dialloc_ag_update_inobt( struct xfs_btree_cur *cur, /* inobt cursor */ struct xfs_inobt_rec_incore *frec, /* finobt record */ int offset) /* inode offset */ { struct xfs_inobt_rec_incore rec; int error; int i; error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; if (XFS_IS_CORRUPT(cur->bc_mp, rec.ir_free != frec->ir_free || rec.ir_freecount != frec->ir_freecount)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return xfs_inobt_update(cur, &rec); } /* * Allocate an inode using the free inode btree, if available. Otherwise, fall * back to the inobt search algorithm. * * The caller selected an AG for us, and made sure that free inodes are * available. */ static int xfs_dialloc_ag( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_ino_t parent, xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); struct xfs_btree_cur *cur; /* finobt cursor */ struct xfs_btree_cur *icur; /* inobt cursor */ struct xfs_inobt_rec_incore rec; xfs_ino_t ino; int error; int offset; int i; if (!xfs_has_finobt(mp)) return xfs_dialloc_ag_inobt(pag, tp, agbp, parent, inop); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. */ if (!pagino) pagino = be32_to_cpu(agi->agi_newino); cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) goto error_cur; /* * The search algorithm depends on whether we're in the same AG as the * parent. If so, find the closest available inode to the parent. If * not, consider the agi hint or find the first free inode in the AG. */ if (pag_agno(pag) == pagno) error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); else error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); if (error) goto error_cur; offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = xfs_agino_to_ino(pag, rec.ir_startino + offset); if (xfs_ag_has_sickness(pag, XFS_SICK_AG_INODES)) { error = xfs_dialloc_check_ino(pag, tp, ino); if (error) goto error_cur; } /* * Modify or remove the finobt record. */ rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; if (rec.ir_freecount) error = xfs_inobt_update(cur, &rec); else error = xfs_btree_delete(cur, &i); if (error) goto error_cur; /* * The finobt has now been updated appropriately. We haven't updated the * agi and superblock yet, so we can create an inobt cursor and validate * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ icur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(icur); if (error) goto error_icur; error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); if (error) goto error_icur; /* * Both trees have now been updated. We must update the perag and * superblock before we can check the freecount for each btree. */ be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount--; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); error = xfs_check_agi_freecount(icur); if (error) goto error_icur; error = xfs_check_agi_freecount(cur); if (error) goto error_icur; xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); *inop = ino; return 0; error_icur: xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); error_cur: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } static int xfs_dialloc_roll( struct xfs_trans **tpp, struct xfs_buf *agibp) { struct xfs_trans *tp = *tpp; struct xfs_dquot_acct *dqinfo; int error; /* * Hold to on to the agibp across the commit so no other allocation can * come in and take the free inodes we just allocated for our caller. */ xfs_trans_bhold(tp, agibp); /* * We want the quota changes to be associated with the next transaction, * NOT this one. So, detach the dqinfo from this and attach it to the * next transaction. */ dqinfo = tp->t_dqinfo; tp->t_dqinfo = NULL; error = xfs_trans_roll(&tp); /* Re-attach the quota info that we detached from prev trx. */ tp->t_dqinfo = dqinfo; /* * Join the buffer even on commit error so that the buffer is released * when the caller cancels the transaction and doesn't have to handle * this error case specially. */ xfs_trans_bjoin(tp, agibp); *tpp = tp; return error; } static bool xfs_dialloc_good_ag( struct xfs_perag *pag, struct xfs_trans *tp, umode_t mode, int flags, bool ok_alloc) { struct xfs_mount *mp = tp->t_mountp; xfs_extlen_t ineed; xfs_extlen_t longest = 0; int needspace; int error; if (!pag) return false; if (!xfs_perag_allows_inodes(pag)) return false; if (!xfs_perag_initialised_agi(pag)) { error = xfs_ialloc_read_agi(pag, tp, 0, NULL); if (error) return false; } if (pag->pagi_freecount) return true; if (!ok_alloc) return false; if (!xfs_perag_initialised_agf(pag)) { error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; } /* * Check that there is enough free space for the file plus a chunk of * inodes if we need to allocate some. If this is the first pass across * the AGs, take into account the potential space needed for alignment * of inode chunks when checking the longest contiguous free space in * the AG - this prevents us from getting ENOSPC because we have free * space larger than ialloc_blks but alignment constraints prevent us * from using it. * * If we can't find an AG with space for full alignment slack to be * taken into account, we must be near ENOSPC in all AGs. Hence we * don't include alignment for the second pass and so if we fail * allocation due to alignment issues then it is most likely a real * ENOSPC condition. * * XXX(dgc): this calculation is now bogus thanks to the per-ag * reservations that xfs_alloc_fix_freelist() now does via * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will * be more than large enough for the check below to succeed, but * xfs_alloc_space_available() will fail because of the non-zero * metadata reservation and hence we won't actually be able to allocate * more inodes in this AG. We do soooo much unnecessary work near ENOSPC * because of this. */ ineed = M_IGEO(mp)->ialloc_min_blks; if (flags && ineed > 1) ineed += M_IGEO(mp)->cluster_align; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode); if (pag->pagf_freeblks < needspace + ineed || longest < ineed) return false; return true; } static int xfs_dialloc_try_ag( struct xfs_perag *pag, struct xfs_trans **tpp, xfs_ino_t parent, xfs_ino_t *new_ino, bool ok_alloc) { struct xfs_buf *agbp; xfs_ino_t ino; int error; /* * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ error = xfs_ialloc_read_agi(pag, *tpp, 0, &agbp); if (error) return error; if (!pag->pagi_freecount) { if (!ok_alloc) { error = -EAGAIN; goto out_release; } error = xfs_ialloc_ag_alloc(pag, *tpp, agbp); if (error < 0) goto out_release; /* * We successfully allocated space for an inode cluster in this * AG. Roll the transaction so that we can allocate one of the * new inodes. */ ASSERT(pag->pagi_freecount > 0); error = xfs_dialloc_roll(tpp, agbp); if (error) goto out_release; } /* Allocate an inode in the found AG */ error = xfs_dialloc_ag(pag, *tpp, agbp, parent, &ino); if (!error) *new_ino = ino; return error; out_release: xfs_trans_brelse(*tpp, agbp); return error; } /* * Pick an AG for the new inode. * * Directories, symlinks, and regular files frequently allocate at least one * block, so factor that potential expansion when we examine whether an AG has * enough space for file creation. Try to keep metadata files all in the same * AG. */ static inline xfs_agnumber_t xfs_dialloc_pick_ag( struct xfs_mount *mp, struct xfs_inode *dp, umode_t mode) { xfs_agnumber_t start_agno; if (!dp) return 0; if (xfs_is_metadir_inode(dp)) { if (mp->m_sb.sb_logstart) return XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); return 0; } if (S_ISDIR(mode)) return (atomic_inc_return(&mp->m_agirotor) - 1) % mp->m_maxagi; start_agno = XFS_INO_TO_AGNO(mp, dp->i_ino); if (start_agno >= mp->m_maxagi) start_agno = 0; return start_agno; } /* * Allocate an on-disk inode. * * Mode is used to tell whether the new inode is a directory and hence where to * locate it. The on-disk inode that is allocated will be returned in @new_ino * on success, otherwise an error will be set to indicate the failure (e.g. * -ENOSPC). */ int xfs_dialloc( struct xfs_trans **tpp, const struct xfs_icreate_args *args, xfs_ino_t *new_ino) { struct xfs_mount *mp = (*tpp)->t_mountp; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_ino_t ino = NULLFSINO; xfs_ino_t parent = args->pip ? args->pip->i_ino : 0; xfs_agnumber_t agno; xfs_agnumber_t start_agno; umode_t mode = args->mode & S_IFMT; bool ok_alloc = true; bool low_space = false; int flags; int error = 0; start_agno = xfs_dialloc_pick_ag(mp, args->pip, mode); /* * If we have already hit the ceiling of inode blocks then clear * ok_alloc so we scan all available agi structures for a free * inode. * * Read rough value of mp->m_icount by percpu_counter_read_positive, * which will sacrifice the preciseness but improve the performance. */ if (igeo->maxicount && percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos > igeo->maxicount) { ok_alloc = false; } /* * If we are near to ENOSPC, we want to prefer allocation from AGs that * have free inodes in them rather than use up free space allocating new * inode chunks. Hence we turn off allocation for the first non-blocking * pass through the AGs if we are near ENOSPC to consume free inodes * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. */ if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; } /* * Loop until we find an allocation group that either has free inodes * or in which we can allocate some inodes. Iterate through the * allocation groups upward, wrapping at the end. */ flags = XFS_ALLOC_FLAG_TRYLOCK; retry: for_each_perag_wrap_at(mp, start_agno, mp->m_maxagi, agno, pag) { if (xfs_dialloc_good_ag(pag, *tpp, mode, flags, ok_alloc)) { error = xfs_dialloc_try_ag(pag, tpp, parent, &ino, ok_alloc); if (error != -EAGAIN) break; error = 0; } if (xfs_is_shutdown(mp)) { error = -EFSCORRUPTED; break; } } if (pag) xfs_perag_rele(pag); if (error) return error; if (ino == NULLFSINO) { if (flags) { flags = 0; if (low_space) ok_alloc = true; goto retry; } return -ENOSPC; } /* * Protect against obviously corrupt allocation btree records. Later * xfs_iget checks will catch re-allocation of other active in-memory * and on-disk inodes. If we don't catch reallocating the parent inode * here we will deadlock in xfs_iget() so we have to do these checks * first. */ if (ino == parent || !xfs_verify_dir_ino(mp, ino)) { xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } *new_ino = ino; return 0; } /* * Free the blocks of an inode chunk. We must consider that the inode chunk * might be sparse and only free the regions that are allocated as part of the * chunk. */ static int xfs_difree_inode_chunk( struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inobt_rec_incore *rec) { struct xfs_mount *mp = tp->t_mountp; xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); int startidx, endidx; int nextbit; xfs_agblock_t agbno; int contigblk; DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ return xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0); } /* holemask is only 16-bits (fits in an unsigned long) */ ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); holemask[0] = rec->ir_holemask; /* * Find contiguous ranges of zeroes (i.e., allocated regions) in the * holemask and convert the start/end index of each range to an extent. * We start with the start and end index both pointing at the first 0 in * the mask. */ startidx = endidx = find_first_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS); nextbit = startidx + 1; while (startidx < XFS_INOBT_HOLEMASK_BITS) { int error; nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, nextbit); /* * If the next zero bit is contiguous, update the end index of * the current range and continue. */ if (nextbit != XFS_INOBT_HOLEMASK_BITS && nextbit == endidx + 1) { endidx = nextbit; goto next; } /* * nextbit is not contiguous with the current end index. Convert * the current start/end to an extent and add it to the free * list. */ agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / mp->m_sb.sb_inopblock; contigblk = ((endidx - startidx + 1) * XFS_INODES_PER_HOLEMASK_BIT) / mp->m_sb.sb_inopblock; ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, xfs_agbno_to_fsb(pag, agbno), contigblk, &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, 0); if (error) return error; /* reset range to current bit and carry on... */ startidx = endidx = nextbit; next: nextbit++; } return 0; } STATIC int xfs_difree_inobt( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_mount *mp = pag_mount(pag); struct xfs_agi *agi = agbp->b_addr; struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int ilen; int error; int i; int off; ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); /* * Initialize the cursor. */ cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) goto error0; /* * Look for the entry describing this inode. */ if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.", __func__, error); goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } /* * Get the offset in the inode chunk. */ off = agino - rec.ir_startino; ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); /* * Mark the inode free & increment the count. */ rec.ir_free |= XFS_INOBT_MASK(off); rec.ir_freecount++; /* * When an inode chunk is free, it becomes eligible for removal. Don't * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* * Remove the inode cluster from the AGI B+Tree, adjust the * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ ilen = rec.ir_freecount; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); pag->pagi_freecount -= ilen - 1; pag->pagi_count -= ilen; xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); if ((error = xfs_btree_delete(cur, &i))) { xfs_warn(mp, "%s: xfs_btree_delete returned error %d.", __func__, error); goto error0; } error = xfs_difree_inode_chunk(tp, pag, &rec); if (error) goto error0; } else { xic->deleted = false; error = xfs_inobt_update(cur, &rec); if (error) { xfs_warn(mp, "%s: xfs_inobt_update returned error %d.", __func__, error); goto error0; } /* * Change the inode free counts and log the ag/sb changes. */ be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); pag->pagi_freecount++; xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } error = xfs_check_agi_freecount(cur); if (error) goto error0; *orec = rec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Free an inode in the free inode btree. */ STATIC int xfs_difree_finobt( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { struct xfs_mount *mp = pag_mount(pag); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; int offset = agino - ibtrec->ir_startino; int error; int i; cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) goto error; if (i == 0) { /* * If the record does not exist in the finobt, we must have just * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. */ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, ibtrec->ir_count, ibtrec->ir_freecount, ibtrec->ir_free, &i); if (error) goto error; ASSERT(i == 1); goto out; } /* * Read and update the existing record. We could just copy the ibtrec * across here, but that would defeat the purpose of having redundant * metadata. By making the modifications independently, we can catch * corruptions that we wouldn't see if we just copied from one record * to another. */ error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; if (XFS_IS_CORRUPT(mp, rec.ir_free != ibtrec->ir_free || rec.ir_freecount != ibtrec->ir_freecount)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } /* * The content of inobt records should always match between the inobt * and finobt. The lifecycle of records in the finobt is different from * the inobt in that the finobt only tracks records with at least one * free inode. Hence, if all of the inodes are free and we aren't * keeping inode chunks permanently on disk, remove the record. * Otherwise, update the record with the new information. * * Note that we currently can't free chunks when the block size is large * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) goto error; ASSERT(i == 1); } else { error = xfs_inobt_update(cur, &rec); if (error) goto error; } out: error = xfs_check_agi_freecount(cur); if (error) goto error; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; error: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } /* * Free disk inode. Carefully avoids touching the incore inode, all * manipulations incore are the caller's responsibility. * The on-disk inode is not changed by this operation, only the * btree (free inode mask) is changed. */ int xfs_difree( struct xfs_trans *tp, struct xfs_perag *pag, xfs_ino_t inode, struct xfs_icluster *xic) { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ struct xfs_buf *agbp; /* buffer for allocation group header */ xfs_agino_t agino; /* allocation group inode number */ int error; /* error return value */ struct xfs_mount *mp = tp->t_mountp; struct xfs_inobt_rec_incore rec;/* btree record */ /* * Break up inode number into its components. */ if (pag_agno(pag) != XFS_INO_TO_AGNO(mp, inode)) { xfs_warn(mp, "%s: agno != pag_agno(pag) (%d != %d).", __func__, XFS_INO_TO_AGNO(mp, inode), pag_agno(pag)); ASSERT(0); return -EINVAL; } agino = XFS_INO_TO_AGINO(mp, inode); if (inode != xfs_agino_to_ino(pag, agino)) { xfs_warn(mp, "%s: inode != xfs_agino_to_ino() (%llu != %llu).", __func__, (unsigned long long)inode, (unsigned long long)xfs_agino_to_ino(pag, agino)); ASSERT(0); return -EINVAL; } agbno = XFS_AGINO_TO_AGBNO(mp, agino); if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { xfs_warn(mp, "%s: agbno >= xfs_ag_block_count (%d >= %d).", __func__, agbno, xfs_ag_block_count(mp, pag_agno(pag))); ASSERT(0); return -EINVAL; } /* * Get the allocation group header. */ error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); return error; } /* * Fix up the inode allocation btree. */ error = xfs_difree_inobt(pag, tp, agbp, agino, xic, &rec); if (error) goto error0; /* * Fix up the free inode btree. */ if (xfs_has_finobt(mp)) { error = xfs_difree_finobt(pag, tp, agbp, agino, &rec); if (error) goto error0; } return 0; error0: return error; } STATIC int xfs_imap_lookup( struct xfs_perag *pag, struct xfs_trans *tp, xfs_agino_t agino, xfs_agblock_t agbno, xfs_agblock_t *chunk_agbno, xfs_agblock_t *offset_agbno, int flags) { struct xfs_mount *mp = pag_mount(pag); struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; int i; error = xfs_ialloc_read_agi(pag, tp, 0, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", __func__, error, pag_agno(pag)); return error; } /* * Lookup the inode record for the given agino. If the record cannot be * found, then it's an invalid inode number and we should abort. Once * we have a record, we need to ensure it contains the inode number * we are looking up. */ cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) error = xfs_inobt_get_rec(cur, &rec, &i); if (!error && i == 0) error = -EINVAL; } xfs_trans_brelse(tp, agbp); xfs_btree_del_cursor(cur, error); if (error) return error; /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino) return -EINVAL; /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) return -EINVAL; *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); *offset_agbno = agbno - *chunk_agbno; return 0; } /* * Return the location of the inode in imap, for mapping it into a buffer. */ int xfs_imap( struct xfs_perag *pag, struct xfs_trans *tp, xfs_ino_t ino, /* inode to locate */ struct xfs_imap *imap, /* location map structure */ uint flags) /* flags for inode btree lookup */ { struct xfs_mount *mp = pag_mount(pag); xfs_agblock_t agbno; /* block number of inode in the alloc group */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ xfs_agblock_t cluster_agbno; /* first block in inode cluster */ int error; /* error code */ int offset; /* index of inode in its buffer */ xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ ASSERT(ino != NULLFSINO); /* * Split up the inode number into its parts. */ agino = XFS_INO_TO_AGINO(mp, ino); agbno = XFS_AGINO_TO_AGBNO(mp, agino); if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) || ino != xfs_agino_to_ino(pag, agino)) { error = -EINVAL; #ifdef DEBUG /* * Don't output diagnostic information for untrusted inodes * as they can be invalid without implying corruption. */ if (flags & XFS_IGET_UNTRUSTED) return error; if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) { xfs_alert(mp, "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", __func__, (unsigned long long)agbno, (unsigned long)xfs_ag_block_count(mp, pag_agno(pag))); } if (ino != xfs_agino_to_ino(pag, agino)) { xfs_alert(mp, "%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)", __func__, ino, xfs_agino_to_ino(pag, agino)); } xfs_stack_trace(); #endif /* DEBUG */ return error; } /* * For bulkstat and handle lookups, we have an untrusted inode number * that we have to verify is valid. We cannot do this just by reading * the inode buffer as it may have been unlinked and removed leaving * inodes in stale state on disk. Hence we have to do a btree lookup * in all cases where an untrusted inode number is passed. */ if (flags & XFS_IGET_UNTRUSTED) { error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) return error; goto out_map; } /* * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ if (M_IGEO(mp)->blocks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); imap->im_blkno = xfs_agbno_to_daddr(pag, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); return 0; } /* * If the inode chunks are aligned then use simple maths to * find the location. Otherwise we have to do a btree * lookup to find the location. */ if (M_IGEO(mp)->inoalign_mask) { offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { error = xfs_imap_lookup(pag, tp, agino, agbno, &chunk_agbno, &offset_agbno, flags); if (error) return error; } out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) * M_IGEO(mp)->blocks_per_cluster); offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds * of the file system then return NULL rather than calling * read_buf and panicing when we get an error from the * driver. */ if ((imap->im_blkno + imap->im_len) > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { xfs_alert(mp, "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)", __func__, (unsigned long long) imap->im_blkno, (unsigned long long) imap->im_len, XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); return -EINVAL; } return 0; } /* * Log specified fields for the ag hdr (inode section). The growth of the agi * structure over time requires that we interpret the buffer as two logical * regions delineated by the end of the unlinked list. This is due to the size * of the hash table and its location in the middle of the agi. * * For example, a request to log a field before agi_unlinked and a field after * agi_unlinked could cause us to log the entire hash table and use an excessive * amount of log space. To avoid this behavior, log the region up through * agi_unlinked in one call and the region after agi_unlinked through the end of * the structure in another. */ void xfs_ialloc_log_agi( struct xfs_trans *tp, struct xfs_buf *bp, uint32_t fields) { int first; /* first byte number */ int last; /* last byte number */ static const short offsets[] = { /* field starting offsets */ /* keep in sync with bit definitions */ offsetof(xfs_agi_t, agi_magicnum), offsetof(xfs_agi_t, agi_versionnum), offsetof(xfs_agi_t, agi_seqno), offsetof(xfs_agi_t, agi_length), offsetof(xfs_agi_t, agi_count), offsetof(xfs_agi_t, agi_root), offsetof(xfs_agi_t, agi_level), offsetof(xfs_agi_t, agi_freecount), offsetof(xfs_agi_t, agi_newino), offsetof(xfs_agi_t, agi_dirino), offsetof(xfs_agi_t, agi_unlinked), offsetof(xfs_agi_t, agi_free_root), offsetof(xfs_agi_t, agi_free_level), offsetof(xfs_agi_t, agi_iblocks), sizeof(xfs_agi_t) }; #ifdef DEBUG struct xfs_agi *agi = bp->b_addr; ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through * agi_unlinked. */ if (fields & XFS_AGI_ALL_BITS_R1) { xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, &first, &last); xfs_trans_log_buf(tp, bp, first, last); } /* * Mask off the bits in the first region and calculate the first and * last field offsets for any bits in the second region. */ fields &= ~XFS_AGI_ALL_BITS_R1; if (fields) { xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, &first, &last); xfs_trans_log_buf(tp, bp, first, last); } } static xfs_failaddr_t xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; uint32_t agi_seqno = be32_to_cpu(agi->agi_seqno); uint32_t agi_length = be32_to_cpu(agi->agi_length); int i; if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) return __this_address; } /* * Validate the magic number of the agi block. */ if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; fa = xfs_validate_ag_length(bp, agi_seqno, agi_length); if (fa) return fa; if (be32_to_cpu(agi->agi_level) < 1 || be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; if (xfs_has_finobt(mp) && (be32_to_cpu(agi->agi_free_level) < 1 || be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO)) continue; if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i]))) return __this_address; } return NULL; } static void xfs_agi_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void xfs_agi_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agi_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } if (!xfs_has_crc(mp)) return; if (bip) agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, }; /* * Read in the allocation group header (inode allocation section) */ int xfs_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, xfs_buf_flags_t flags, struct xfs_buf **agibpp) { struct xfs_mount *mp = pag_mount(pag); int error; trace_xfs_read_agi(pag); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops); if (xfs_metadata_is_sick(error)) xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) return error; if (tp) xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*agibpp, XFS_AGI_REF); return 0; } /* * Read in the agi and initialise the per-ag data. If the caller supplies a * @agibpp, return the locked AGI buffer to them, otherwise release it. */ int xfs_ialloc_read_agi( struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agibpp) { struct xfs_buf *agibp; struct xfs_agi *agi; int error; trace_xfs_ialloc_read_agi(pag); error = xfs_read_agi(pag, tp, (flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, &agibp); if (error) return error; agi = agibp->b_addr; if (!xfs_perag_initialised_agi(pag)) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } /* * It's possible for these to be out of sync if * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || xfs_is_shutdown(pag_mount(pag))); if (agibpp) *agibpp = agibp; else xfs_trans_brelse(tp, agibp); return 0; } /* How many inodes are backed by inode clusters ondisk? */ STATIC int xfs_ialloc_count_ondisk( struct xfs_btree_cur *cur, xfs_agino_t low, xfs_agino_t high, unsigned int *allocated) { struct xfs_inobt_rec_incore irec; unsigned int ret = 0; int has_record; int error; error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record); if (error) return error; while (has_record) { unsigned int i, hole_idx; error = xfs_inobt_get_rec(cur, &irec, &has_record); if (error) return error; if (irec.ir_startino > high) break; for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { if (irec.ir_startino + i < low) continue; if (irec.ir_startino + i > high) break; hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT; if (!(irec.ir_holemask & (1U << hole_idx))) ret++; } error = xfs_btree_increment(cur, 0, &has_record); if (error) return error; } *allocated = ret; return 0; } /* Is there an inode record covering a given extent? */ int xfs_ialloc_has_inodes_at_extent( struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, enum xbtree_recpacking *outcome) { xfs_agino_t agino; xfs_agino_t last_agino; unsigned int allocated; int error; agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno); last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1; error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated); if (error) return error; if (allocated == 0) *outcome = XBTREE_RECPACKING_EMPTY; else if (allocated == last_agino - agino + 1) *outcome = XBTREE_RECPACKING_FULL; else *outcome = XBTREE_RECPACKING_SPARSE; return 0; } struct xfs_ialloc_count_inodes { xfs_agino_t count; xfs_agino_t freecount; }; /* Record inode counts across all inobt records. */ STATIC int xfs_ialloc_count_inodes_rec( struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv) { struct xfs_inobt_rec_incore irec; struct xfs_ialloc_count_inodes *ci = priv; xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); ci->count += irec.ir_count; ci->freecount += irec.ir_freecount; return 0; } /* Count allocated and free inodes under an inobt. */ int xfs_ialloc_count_inodes( struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount) { struct xfs_ialloc_count_inodes ci = {0}; int error; ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); if (error) return error; *count = ci.count; *freecount = ci.freecount; return 0; } /* * Initialize inode-related geometry information. * * Compute the inode btree min and max levels and set maxicount. * * Set the inode cluster size. This may still be overridden by the file * system block size if it is larger than the chosen cluster size. * * For v5 filesystems, scale the cluster size with the inode size to keep a * constant ratio of inode per cluster buffer, but only if mkfs has set the * inode alignment value appropriately for larger cluster sizes. * * Then compute the inode cluster alignment information. */ void xfs_ialloc_setup_geometry( struct xfs_mount *mp) { struct xfs_sb *sbp = &mp->m_sb; struct xfs_ino_geometry *igeo = M_IGEO(mp); uint64_t icount; uint inodes; igeo->new_diflags2 = 0; if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; if (xfs_has_large_extent_counts(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64; /* Compute inode btree geometry. */ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true); igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false); igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2; igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2; igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK, sbp->sb_inopblock); igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog; if (sbp->sb_spino_align) igeo->ialloc_min_blks = sbp->sb_spino_align; else igeo->ialloc_min_blks = igeo->ialloc_blks; /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, inodes); ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk()); /* * Set the maximum inode count for this filesystem, being careful not * to use obviously garbage sb_inopblog/sb_inopblock values. Regular * users should never get here due to failing sb verification, but * certain users (xfs_db) need to be usable even with corrupt metadata. */ if (sbp->sb_imax_pct && igeo->ialloc_blks) { /* * Make sure the maximum inode count is a multiple * of the units we allocate inodes in. */ icount = sbp->sb_dblocks * sbp->sb_imax_pct; do_div(icount, 100); do_div(icount, igeo->ialloc_blks); igeo->maxicount = XFS_FSB_TO_INO(mp, icount * igeo->ialloc_blks); } else { igeo->maxicount = 0; } /* * Compute the desired size of an inode cluster buffer size, which * starts at 8K and (on v5 filesystems) scales up with larger inode * sizes. * * Preserve the desired inode cluster size because the sparse inodes * feature uses that desired size (not the actual size) to compute the * sparse inode alignment. The mount code validates this value, so we * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; if (xfs_has_v3inodes(mp)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) igeo->inode_cluster_size_raw = new_size; } /* Calculate inode cluster ratios. */ if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize) igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw); else igeo->blocks_per_cluster = 1; igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster); igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster); /* Calculate inode cluster alignment. */ if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster) igeo->cluster_align = mp->m_sb.sb_inoalignmt; else igeo->cluster_align = 1; igeo->inoalign_mask = igeo->cluster_align - 1; igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align); /* * If we are using stripe alignment, check whether * the stripe unit is a multiple of the inode alignment */ if (mp->m_dalign && igeo->inoalign_mask && !(mp->m_dalign & igeo->inoalign_mask)) igeo->ialloc_align = mp->m_dalign; else igeo->ialloc_align = 0; if (mp->m_sb.sb_blocksize > PAGE_SIZE) igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT; else igeo->min_folio_order = 0; } /* Compute the location of the root directory inode that is laid out by mkfs. */ xfs_ino_t xfs_ialloc_calc_rootino( struct xfs_mount *mp, int sunit) { struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_agblock_t first_bno; /* * Pre-calculate the geometry of AG 0. We know what it looks like * because libxfs knows how to create allocation groups now. * * first_bno is the first block in which mkfs could possibly have * allocated the root directory inode, once we factor in the metadata * that mkfs formats before it. Namely, the four AG headers... */ first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); /* ...the two free space btree roots... */ first_bno += 2; /* ...the inode btree root... */ first_bno += 1; /* ...the initial AGFL... */ first_bno += xfs_alloc_min_freelist(mp, NULL); /* ...the free inode btree root... */ if (xfs_has_finobt(mp)) first_bno++; /* ...the reverse mapping btree root... */ if (xfs_has_rmapbt(mp)) first_bno++; /* ...the reference count btree... */ if (xfs_has_reflink(mp)) first_bno++; /* * ...and the log, if it is allocated in the first allocation group. * * This can happen with filesystems that only have a single * allocation group, or very odd geometries created by old mkfs * versions on very small filesystems. */ if (xfs_ag_contains_log(mp, 0)) first_bno += mp->m_sb.sb_logblocks; /* * Now round first_bno up to whatever allocation alignment is given * by the filesystem or was passed in. */ if (xfs_has_dalign(mp) && igeo->ialloc_align > 0) first_bno = roundup(first_bno, sunit); else if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt > 1) first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); } /* * Ensure there are not sparse inode clusters that cross the new EOAG. * * This is a no-op for non-spinode filesystems since clusters are always fully * allocated and checking the bnobt suffices. However, a spinode filesystem * could have a record where the upper inodes are free blocks. If those blocks * were removed from the filesystem, the inode record would extend beyond EOAG, * which will be flagged as corruption. */ int xfs_ialloc_check_shrink( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agibp, xfs_agblock_t new_length) { struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; xfs_agino_t agino; int has; int error; if (!xfs_has_sparseinodes(pag_mount(pag))) return 0; cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. */ agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has); if (error || !has) goto out; error = xfs_inobt_get_rec(cur, &rec, &has); if (error) goto out; if (!has) { xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT); error = -EFSCORRUPTED; goto out; } /* If the record covers inodes that would be beyond EOFS, bail out. */ if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) { error = -ENOSPC; goto out; } out: xfs_btree_del_cursor(cur, error); return error; }
1 15 12 12 12 12 12 12 12 3 12 12 12 12 12 12 12 12 15 15 11 1 5 5 3 5 3 3 5 5 1 1 5 5 1 5 5 5 15 15 5 3 12 15 5 4 15 14 4 12 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/checkpoint.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1999 Red Hat Software --- All Rights Reserved * * Checkpoint routines for the generic filesystem journaling code. * Part of the ext2fs journaling system. * * Checkpointing is the process of ensuring that a section of the log is * committed fully to disk, so that that portion of the log can be * reused. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <trace/events/jbd2.h> /* * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. */ static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction = jh->b_cp_transaction; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; if (transaction->t_checkpoint_list == jh) { transaction->t_checkpoint_list = jh->b_cpnext; if (transaction->t_checkpoint_list == jh) transaction->t_checkpoint_list = NULL; } } /* * __jbd2_log_wait_for_space: wait until there is space in the journal. * * Called under j-state_lock *only*. It will be unlocked if we have to wait * for a checkpoint to free up some space in the log. */ void __jbd2_log_wait_for_space(journal_t *journal) __acquires(&journal->j_state_lock) __releases(&journal->j_state_lock) { int nblocks, space_left; /* assert_spin_locked(&journal->j_state_lock); */ nblocks = journal->j_max_transaction_buffers; while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); mutex_lock_io(&journal->j_checkpoint_mutex); /* * Test again, another process may have checkpointed while we * were waiting for the checkpoint lock. If there are no * transactions ready to be checkpointed, try to recover * journal space by calling cleanup_journal_tail(), and if * that doesn't work, by waiting for the currently committing * transaction to complete. If there is absolutely no way * to make progress, this is either a BUG or corrupted * filesystem, so abort the journal and leave a stack * trace for forensic evidence. */ write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) { mutex_unlock(&journal->j_checkpoint_mutex); return; } spin_lock(&journal->j_list_lock); space_left = jbd2_log_space_left(journal); if (space_left < nblocks) { int chkpt = journal->j_checkpoint_transactions != NULL; tid_t tid = 0; bool has_transaction = false; if (journal->j_committing_transaction) { tid = journal->j_committing_transaction->t_tid; has_transaction = true; } spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); } else if (jbd2_cleanup_journal_tail(journal) <= 0) { /* * We were able to recover space or the * journal was aborted due to an error. */ ; } else if (has_transaction) { /* * jbd2_journal_commit_transaction() may want * to take the checkpoint_mutex if JBD2_FLUSHED * is set. So we need to temporarily drop it. */ mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); write_lock(&journal->j_state_lock); continue; } else { printk(KERN_ERR "%s: needed %d blocks and " "only had %d space available\n", __func__, nblocks, space_left); printk(KERN_ERR "%s: no way to get more " "journal space in %s\n", __func__, journal->j_devname); WARN_ON(1); jbd2_journal_abort(journal, -EIO); } write_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); } mutex_unlock(&journal->j_checkpoint_mutex); } } static void __flush_batch(journal_t *journal, int *batch_count) { int i; struct blk_plug plug; blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; BUFFER_TRACE(bh, "brelse"); __brelse(bh); journal->j_chkpt_bhs[i] = NULL; } *batch_count = 0; } /* * Perform an actual checkpoint. We take the first transaction on the * list of transactions to be checkpointed and send all its buffers * to disk. We submit larger chunks of data at once. * * The journal should be locked before calling this function. * Called with j_checkpoint_mutex held. */ int jbd2_log_do_checkpoint(journal_t *journal) { struct journal_head *jh; struct buffer_head *bh; transaction_t *transaction; tid_t this_tid; int result, batch_count = 0; jbd2_debug(1, "Start checkpoint\n"); /* * First thing: if there are any transactions in the log which * don't need checkpointing, just eliminate them from the * journal straight away. */ result = jbd2_cleanup_journal_tail(journal); trace_jbd2_checkpoint(journal, result); jbd2_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; /* * OK, we need to start writing disk blocks. Take one transaction * and write it. */ spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; transaction = journal->j_checkpoint_transactions; if (transaction->t_chp_stats.cs_chp_time == 0) transaction->t_chp_stats.cs_chp_time = jiffies; this_tid = transaction->t_tid; restart: /* * If someone cleaned up this transaction while we slept, we're * done (maybe it's a new transaction, but it fell at the same * address). */ if (journal->j_checkpoint_transactions != transaction || transaction->t_tid != this_tid) goto out; /* checkpoint all of the transaction's buffers */ while (transaction->t_checkpoint_list) { jh = transaction->t_checkpoint_list; bh = jh2bh(jh); if (jh->b_transaction != NULL) { transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); if (unlikely(journal->j_flags & JBD2_UNMOUNT)) /* * The journal thread is dead; so * starting and waiting for a commit * to finish will cause us to wait for * a _very_ long time. */ printk(KERN_ERR "JBD2: %s: Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); if (batch_count) __flush_batch(journal, &batch_count); jbd2_log_start_commit(journal, tid); /* * jbd2_journal_commit_transaction() may want * to take the checkpoint_mutex if JBD2_FLUSHED * is set, jbd2_update_log_tail() called by * jbd2_journal_commit_transaction() may also take * checkpoint_mutex. So we need to temporarily * drop it. */ mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); mutex_lock_io(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); goto restart; } if (!trylock_buffer(bh)) { /* * The buffer is locked, it may be writing back, or * flushing out in the last couple of cycles, or * re-adding into a new transaction, need to check * it again until it's unlocked. */ get_bh(bh); spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); goto retry; } else if (!buffer_dirty(bh)) { unlock_buffer(bh); BUFFER_TRACE(bh, "remove from checkpoint"); /* * If the transaction was released or the checkpoint * list was empty, we're done. */ if (__jbd2_journal_remove_checkpoint(jh) || !transaction->t_checkpoint_list) goto out; } else { unlock_buffer(bh); /* * We are about to write the buffer, it could be * raced by some other transaction shrink or buffer * re-log logic once we release the j_list_lock, * leave it on the checkpoint list and check status * again to make sure it's clean. */ BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); journal->j_chkpt_bhs[batch_count++] = bh; transaction->t_chp_stats.cs_written++; transaction->t_checkpoint_list = jh->b_cpnext; } if ((batch_count == JBD2_NR_BATCH) || need_resched() || spin_needbreak(&journal->j_list_lock) || jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0]) goto unlock_and_flush; } if (batch_count) { unlock_and_flush: spin_unlock(&journal->j_list_lock); retry: if (batch_count) __flush_batch(journal, &batch_count); spin_lock(&journal->j_list_lock); goto restart; } out: spin_unlock(&journal->j_list_lock); result = jbd2_cleanup_journal_tail(journal); return (result < 0) ? result : 0; } /* * Check the list of checkpoint transactions for the journal to see if * we have already got rid of any since the last update of the log tail * in the journal superblock. If so, we can instantly roll the * superblock forward to remove those transactions from the log. * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. * * Called with the journal lock held. * * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed * even in abort state, but we must not update the super block if * checkpointing may have failed. Otherwise, we would lose some metadata * buffers which should be written-back to the filesystem. */ int jbd2_cleanup_journal_tail(journal_t *journal) { tid_t first_tid; unsigned long blocknr; if (is_journal_aborted(journal)) return -EIO; if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; J_ASSERT(blocknr != 0); /* * We need to make sure that any blocks that were recently written out * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before * we drop the transactions from the journal. It's unlikely this will * be necessary, especially with an appropriately sized journal, but we * need this to guarantee correctness. Fortunately * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(journal->j_fs_dev); return __jbd2_update_log_tail(journal, first_tid, blocknr); } /* Checkpoint list management */ /* * journal_shrink_one_cp_list * * Find all the written-back checkpoint buffers in the given list * and try to release them. If the whole transaction is released, set * the 'released' parameter. Return the number of released checkpointed * buffers. * * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, enum jbd2_shrink_type type, bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; unsigned long nr_freed = 0; int ret; *released = false; if (!jh) return 0; last_jh = jh->b_cpprev; do { jh = next_jh; next_jh = jh->b_cpnext; if (type == JBD2_SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); if (ret < 0) { if (type == JBD2_SHRINK_BUSY_SKIP) continue; break; } } nr_freed++; if (ret) { *released = true; break; } if (need_resched()) break; } while (jh != last_jh); return nr_freed; } /* * jbd2_journal_shrink_checkpoint_list * * Find 'nr_to_scan' written-back checkpoint buffers in the journal * and try to release them. Return the number of released checkpointed * buffers. * * Called with j_list_lock held. */ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan) { transaction_t *transaction, *last_transaction, *next_transaction; bool __maybe_unused released; tid_t first_tid = 0, last_tid = 0, next_tid = 0; tid_t tid = 0; unsigned long nr_freed = 0; unsigned long freed; bool first_set = false; again: spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) { spin_unlock(&journal->j_list_lock); goto out; } /* * Get next shrink transaction, resume previous scan or start * over again. If some others do checkpoint and drop transaction * from the checkpoint list, we ignore saved j_shrink_transaction * and start over unconditionally. */ if (journal->j_shrink_transaction) transaction = journal->j_shrink_transaction; else transaction = journal->j_checkpoint_transactions; if (!first_set) { first_tid = transaction->t_tid; first_set = true; } last_transaction = journal->j_checkpoint_transactions->t_cpprev; next_transaction = transaction; last_tid = last_transaction->t_tid; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, JBD2_SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) break; if (need_resched() || spin_needbreak(&journal->j_list_lock)) break; } while (transaction != last_transaction); if (transaction != last_transaction) { journal->j_shrink_transaction = next_transaction; next_tid = next_transaction->t_tid; } else { journal->j_shrink_transaction = NULL; next_tid = 0; } spin_unlock(&journal->j_list_lock); cond_resched(); if (*nr_to_scan && journal->j_shrink_transaction) goto again; out: trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, nr_freed, next_tid); return nr_freed; } /* * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function. * * Called with j_list_lock held. */ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type) { transaction_t *transaction, *last_transaction, *next_transaction; bool released; WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP); transaction = journal->j_checkpoint_transactions; if (!transaction) return; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; journal_shrink_one_cp_list(transaction->t_checkpoint_list, type, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ if (need_resched()) return; /* * Stop scanning if we couldn't free the transaction. This * avoids pointless scanning of transactions which still * weren't checkpointed. */ if (!released) return; } while (transaction != last_transaction); } /* * Remove buffers from all checkpoint lists as journal is aborted and we just * need to free memory */ void jbd2_journal_destroy_checkpoint(journal_t *journal) { /* * We loop because __jbd2_journal_clean_checkpoint_list() may abort * early due to a need of rescheduling. */ while (1) { spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) { spin_unlock(&journal->j_list_lock); break; } __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY); spin_unlock(&journal->j_list_lock); cond_resched(); } } /* * journal_remove_checkpoint: called after a buffer has been committed * to disk (either by being write-back flushed to disk, or being * committed to the log). * * We cannot safely clean a transaction out of the log until all of the * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint * lists until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. * The function can free jh and bh. * * This function is called with j_list_lock held. */ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; JBUFFER_TRACE(jh, "entry"); transaction = jh->b_cp_transaction; if (!transaction) { JBUFFER_TRACE(jh, "not on transaction"); return 0; } journal = transaction->t_journal; JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; percpu_counter_dec(&journal->j_checkpoint_jh_count); jbd2_journal_put_journal_head(jh); /* Is this transaction empty? */ if (transaction->t_checkpoint_list) return 0; /* * There is one special case to worry about: if we have just pulled the * buffer off a running or committing transaction's checkpoing list, * then even if the checkpoint list is empty, the transaction obviously * cannot be dropped! * * The locking here around t_state is a bit sleazy. * See the comment at the end of jbd2_journal_commit_transaction(). */ if (transaction->t_state != T_FINISHED) return 0; /* * OK, that was the last buffer for the transaction, we can now * safely remove this transaction from the log. */ stats = &transaction->t_chp_stats; if (stats->cs_chp_time) stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, jiffies); trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); return 1; } /* * Check the checkpoint buffer and try to remove it from the checkpoint * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if * it frees the transaction, 0 otherwise. * * This function is called with j_list_lock held. */ int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); if (jh->b_transaction) return -EBUSY; if (!trylock_buffer(bh)) return -EBUSY; if (buffer_dirty(bh)) { unlock_buffer(bh); return -EBUSY; } unlock_buffer(bh); /* * Buffer is clean and the IO has finished (we held the buffer * lock) so the checkpoint is done. We can safely remove the * buffer from this transaction. */ JBUFFER_TRACE(jh, "remove from checkpoint list"); return __jbd2_journal_remove_checkpoint(jh); } /* * journal_insert_checkpoint: put a committed buffer onto a checkpoint * list so that we know when it is safe to clean the transaction out of * the log. * * Called with the journal locked. * Called with j_list_lock held. */ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, transaction_t *transaction) { JBUFFER_TRACE(jh, "entry"); J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); /* Get reference for checkpointing transaction */ jbd2_journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { jh->b_cpnext = jh->b_cpprev = jh; } else { jh->b_cpnext = transaction->t_checkpoint_list; jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; jh->b_cpprev->b_cpnext = jh; jh->b_cpnext->b_cpprev = jh; } transaction->t_checkpoint_list = jh; percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count); } /* * We've finished with this transaction structure: adios... * * The transaction must have no links except for the checkpoint by this * point. * * Called with the journal locked. * Called with j_list_lock held. */ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) { assert_spin_locked(&journal->j_list_lock); journal->j_shrink_transaction = NULL; if (transaction->t_cpnext) { transaction->t_cpnext->t_cpprev = transaction->t_cpprev; transaction->t_cpprev->t_cpnext = transaction->t_cpnext; if (journal->j_checkpoint_transactions == transaction) journal->j_checkpoint_transactions = transaction->t_cpnext; if (journal->j_checkpoint_transactions == transaction) journal->j_checkpoint_transactions = NULL; } J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); trace_jbd2_drop_transaction(journal, transaction); jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* V4L2 device support header. Copyright (C) 2008 Hans Verkuil <hverkuil@xs4all.nl> */ #ifndef _V4L2_DEVICE_H #define _V4L2_DEVICE_H #include <media/media-device.h> #include <media/v4l2-subdev.h> #include <media/v4l2-dev.h> struct v4l2_ctrl_handler; /** * struct v4l2_device - main struct to for V4L2 device drivers * * @dev: pointer to struct device. * @mdev: pointer to struct media_device, may be NULL. * @subdevs: used to keep track of the registered subdevs * @lock: lock this struct; can be used by the driver as well * if this struct is embedded into a larger struct. * @name: unique device name, by default the driver name + bus ID * @notify: notify operation called by some sub-devices. * @ctrl_handler: The control handler. May be %NULL. * @prio: Device's priority state * @ref: Keep track of the references to this struct. * @release: Release function that is called when the ref count * goes to 0. * * Each instance of a V4L2 device should create the v4l2_device struct, * either stand-alone or embedded in a larger struct. * * It allows easy access to sub-devices (see v4l2-subdev.h) and provides * basic V4L2 device-level support. * * .. note:: * * #) @dev->driver_data points to this struct. * #) @dev might be %NULL if there is no parent device */ struct v4l2_device { struct device *dev; struct media_device *mdev; struct list_head subdevs; spinlock_t lock; char name[36]; void (*notify)(struct v4l2_subdev *sd, unsigned int notification, void *arg); struct v4l2_ctrl_handler *ctrl_handler; struct v4l2_prio_state prio; struct kref ref; void (*release)(struct v4l2_device *v4l2_dev); }; /** * v4l2_device_get - gets a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to increment the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ static inline void v4l2_device_get(struct v4l2_device *v4l2_dev) { kref_get(&v4l2_dev->ref); } /** * v4l2_device_put - puts a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to decrement the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ int v4l2_device_put(struct v4l2_device *v4l2_dev); /** * v4l2_device_register - Initialize v4l2_dev and make @dev->driver_data * point to @v4l2_dev. * * @dev: pointer to struct &device * @v4l2_dev: pointer to struct &v4l2_device * * .. note:: * @dev may be %NULL in rare cases (ISA devices). * In such case the caller must fill in the @v4l2_dev->name field * before calling this function. */ int __must_check v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev); /** * v4l2_device_set_name - Optional function to initialize the * name field of struct &v4l2_device * * @v4l2_dev: pointer to struct &v4l2_device * @basename: base name for the device name * @instance: pointer to a static atomic_t var with the instance usage for * the device driver. * * v4l2_device_set_name() initializes the name field of struct &v4l2_device * using the driver name and a driver-global atomic_t instance. * * This function will increment the instance counter and returns the * instance value used in the name. * * Example: * * static atomic_t drv_instance = ATOMIC_INIT(0); * * ... * * instance = v4l2_device_set_name(&\ v4l2_dev, "foo", &\ drv_instance); * * The first time this is called the name field will be set to foo0 and * this function returns 0. If the name ends with a digit (e.g. cx18), * then the name will be set to cx18-0 since cx180 would look really odd. */ int v4l2_device_set_name(struct v4l2_device *v4l2_dev, const char *basename, atomic_t *instance); /** * v4l2_device_disconnect - Change V4L2 device state to disconnected. * * @v4l2_dev: pointer to struct v4l2_device * * Should be called when the USB parent disconnects. * Since the parent disappears, this ensures that @v4l2_dev doesn't have * an invalid parent pointer. * * .. note:: This function sets @v4l2_dev->dev to NULL. */ void v4l2_device_disconnect(struct v4l2_device *v4l2_dev); /** * v4l2_device_unregister - Unregister all sub-devices and any other * resources related to @v4l2_dev. * * @v4l2_dev: pointer to struct v4l2_device */ void v4l2_device_unregister(struct v4l2_device *v4l2_dev); /** * v4l2_device_register_subdev - Registers a subdev with a v4l2 device. * * @v4l2_dev: pointer to struct &v4l2_device * @sd: pointer to &struct v4l2_subdev * * While registered, the subdev module is marked as in-use. * * An error is returned if the module is no longer loaded on any attempts * to register it. */ #define v4l2_device_register_subdev(v4l2_dev, sd) \ __v4l2_device_register_subdev(v4l2_dev, sd, THIS_MODULE) int __must_check __v4l2_device_register_subdev(struct v4l2_device *v4l2_dev, struct v4l2_subdev *sd, struct module *module); /** * v4l2_device_unregister_subdev - Unregisters a subdev with a v4l2 device. * * @sd: pointer to &struct v4l2_subdev * * .. note :: * * Can also be called if the subdev wasn't registered. In such * case, it will do nothing. */ void v4l2_device_unregister_subdev(struct v4l2_subdev *sd); /** * __v4l2_device_register_subdev_nodes - Registers device nodes for * all subdevs of the v4l2 device that are marked with the * %V4L2_SUBDEV_FL_HAS_DEVNODE flag. * * @v4l2_dev: pointer to struct v4l2_device * @read_only: subdevices read-only flag. True to register the subdevices * device nodes in read-only mode, false to allow full access to the * subdevice userspace API. */ int __must_check __v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev, bool read_only); /** * v4l2_device_register_subdev_nodes - Registers subdevices device nodes with * unrestricted access to the subdevice userspace operations * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, false); #else return 0; #endif } /** * v4l2_device_register_ro_subdev_nodes - Registers subdevices device nodes * in read-only mode * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_ro_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, true); #else return 0; #endif } /** * v4l2_subdev_notify - Sends a notification to v4l2_device. * * @sd: pointer to &struct v4l2_subdev * @notification: type of notification. Please notice that the notification * type is driver-specific. * @arg: arguments for the notification. Those are specific to each * notification type. */ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd, unsigned int notification, void *arg) { if (sd && sd->v4l2_dev && sd->v4l2_dev->notify) sd->v4l2_dev->notify(sd, notification, arg); } /** * v4l2_device_supports_requests - Test if requests are supported. * * @v4l2_dev: pointer to struct v4l2_device */ static inline bool v4l2_device_supports_requests(struct v4l2_device *v4l2_dev) { return v4l2_dev->mdev && v4l2_dev->mdev->ops && v4l2_dev->mdev->ops->req_queue; } /* Helper macros to iterate over all subdevs. */ /** * v4l2_device_for_each_subdev - Helper macro that interates over all * sub-devices of a given &v4l2_device. * * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * * This macro iterates over all sub-devices owned by the @v4l2_dev device. * It acts as a for loop iterator and executes the next statement with * the @sd variable pointing to each sub-device in turn. */ #define v4l2_device_for_each_subdev(sd, v4l2_dev) \ list_for_each_entry(sd, &(v4l2_dev)->subdevs, list) /** * __v4l2_device_call_subdevs_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_p(v4l2_dev, sd, cond, o, f, args...) \ do { \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ (sd)->ops->o->f((sd) , ##args); \ } while (0) /** * __v4l2_device_call_subdevs - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs(v4l2_dev, cond, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ } while (0) /** * __v4l2_device_call_subdevs_until_err_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev sub-devices associated with @v4l2_dev. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, zero * otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err_p(v4l2_dev, sd, cond, o, f, args...) \ ({ \ long __err = 0; \ \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) { \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ __err = (sd)->ops->o->f((sd) , ##args); \ if (__err && __err != -ENOIOCTLCMD) \ break; \ } \ (__err == -ENOIOCTLCMD) ? 0 : __err; \ }) /** * __v4l2_device_call_subdevs_until_err - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err(v4l2_dev, cond, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ }) /** * v4l2_device_call_all - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_all(v4l2_dev, grpid, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ } while (0) /** * v4l2_device_call_until_err - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver, until an error occurs. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_until_err(v4l2_dev, grpid, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ }) /** * v4l2_device_mask_call_all - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_all(v4l2_dev, grpmsk, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ } while (0) /** * v4l2_device_mask_call_until_err - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_until_err(v4l2_dev, grpmsk, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ }) /** * v4l2_device_has_op - checks if any subdev with matching grpid has a * given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_has_op(v4l2_dev, grpid, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpid) && __sd->grp_id != (grpid)) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) /** * v4l2_device_mask_has_op - checks if any subdev with matching group * mask has a given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_mask_has_op(v4l2_dev, grpmsk, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpmsk) && !(__sd->grp_id & (grpmsk))) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #ifndef __IOMMUFD_PRIVATE_H #define __IOMMUFD_PRIVATE_H #include <linux/iommu.h> #include <linux/iommufd.h> #include <linux/iova_bitmap.h> #include <linux/rwsem.h> #include <linux/uaccess.h> #include <linux/xarray.h> #include <uapi/linux/iommufd.h> #include "../iommu-priv.h" struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; struct iommufd_sw_msi_map { struct list_head sw_msi_item; phys_addr_t sw_msi_start; phys_addr_t msi_addr; unsigned int pgoff; unsigned int id; }; /* Bitmap of struct iommufd_sw_msi_map::id */ struct iommufd_sw_msi_maps { DECLARE_BITMAP(bitmap, 64); }; #ifdef CONFIG_IRQ_MSI_IOMMU int iommufd_sw_msi_install(struct iommufd_ctx *ictx, struct iommufd_hwpt_paging *hwpt_paging, struct iommufd_sw_msi_map *msi_map); #endif struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; struct mutex sw_msi_lock; struct list_head sw_msi_list; unsigned int sw_msi_id; u8 account_mode; /* Compatibility with VFIO no iommu */ u8 no_iommu_mode; struct iommufd_ioas *vfio_ioas; }; /* * The IOVA to PFN map. The map automatically copies the PFNs into multiple * domains and permits sharing of PFNs between io_pagetable instances. This * supports both a design where IOAS's are 1:1 with a domain (eg because the * domain is HW customized), or where the IOAS is 1:N with multiple generic * domains. The io_pagetable holds an interval tree of iopt_areas which point * to shared iopt_pages which hold the pfns mapped to the page table. * * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex */ struct io_pagetable { struct rw_semaphore domains_rwsem; struct xarray domains; struct xarray access_list; unsigned int next_domain_id; struct rw_semaphore iova_rwsem; struct rb_root_cached area_itree; /* IOVA that cannot become reserved, struct iopt_allowed */ struct rb_root_cached allowed_itree; /* IOVA that cannot be allocated, struct iopt_reserved */ struct rb_root_cached reserved_itree; u8 disable_large_pages; unsigned long iova_alignment; }; void iopt_init_table(struct io_pagetable *iopt); void iopt_destroy_table(struct io_pagetable *iopt); int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list); void iopt_free_pages_list(struct list_head *pages_list); enum { IOPT_ALLOC_IOVA = 1 << 0, }; int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, struct file *file, unsigned long start, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags); int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap); int iopt_set_dirty_tracking(struct io_pagetable *iopt, struct iommu_domain *domain, bool enable); void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain); void iopt_table_remove_domain(struct io_pagetable *iopt, struct iommu_domain *domain); int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, struct device *dev, phys_addr_t *sw_msi_start); int iopt_set_allow_iova(struct io_pagetable *iopt, struct rb_root_cached *allowed_iova); int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, unsigned long last, void *owner); void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner); int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, size_t num_iovas); void iopt_enable_large_pages(struct io_pagetable *iopt); int iopt_disable_large_pages(struct io_pagetable *iopt); struct iommufd_ucmd { struct iommufd_ctx *ictx; void __user *ubuffer; u32 user_size; void *cmd; }; int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, unsigned long arg); /* Copy the response in ucmd->cmd back to userspace. */ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, size_t cmd_len) { if (copy_to_user(ucmd->ubuffer, ucmd->cmd, min_t(size_t, ucmd->user_size, cmd_len))) return -EFAULT; return 0; } static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!refcount_inc_not_zero(&obj->users)) return false; if (!refcount_inc_not_zero(&obj->shortterm_users)) { /* * If the caller doesn't already have a ref on obj this must be * called under the xa_lock. Otherwise the caller is holding a * ref on users. Thus it cannot be one before this decrement. */ refcount_dec(&obj->users); return false; } return true; } struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, enum iommufd_object_type type); static inline void iommufd_put_object(struct iommufd_ctx *ictx, struct iommufd_object *obj) { /* * Users first, then shortterm so that REMOVE_WAIT_SHORTTERM never sees * a spurious !0 users with a 0 shortterm_users. */ refcount_dec(&obj->users); if (refcount_dec_and_test(&obj->shortterm_users)) wake_up_interruptible_all(&ictx->destroy_wait); } void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj); enum { REMOVE_WAIT_SHORTTERM = 1, }; int iommufd_object_remove(struct iommufd_ctx *ictx, struct iommufd_object *to_destroy, u32 id, unsigned int flags); /* * The caller holds a users refcount and wants to destroy the object. At this * point the caller has no shortterm_users reference and at least the xarray * will be holding one. */ static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { int ret; ret = iommufd_object_remove(ictx, obj, obj->id, REMOVE_WAIT_SHORTTERM); /* * If there is a bug and we couldn't destroy the object then we did put * back the caller's users refcount and will eventually try to free it * again during close. */ WARN_ON(ret); } /* * The HWPT allocated by autodomains is used in possibly many devices and * is automatically destroyed when its refcount reaches zero. * * If userspace uses the HWPT manually, even for a short term, then it will * disrupt this refcounting and the auto-free in the kernel will not work. * Userspace that tries to use the automatically allocated HWPT must be careful * to ensure that it is consistently destroyed, eg by not racing accesses * and by not attaching an automatic HWPT to a device manually. */ static inline void iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj) { iommufd_object_remove(ictx, obj, obj->id, 0); } #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ offsetof(typeof(*(ptr)), \ obj) != 0), \ type), \ typeof(*(ptr)), obj) #define iommufd_object_alloc(ictx, ptr, type) \ __iommufd_object_alloc(ictx, ptr, type, obj) /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The * mapping is copied into all of the associated domains and made available to * in-kernel users. * * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable * object. When we go to attach a device to an IOAS we need to get an * iommu_domain and wrapping iommufd_hw_pagetable for it. * * An iommu_domain & iommfd_hw_pagetable will be automatically selected * for a device based on the hwpt_list. If no suitable iommu_domain * is found a new iommu_domain will be created. */ struct iommufd_ioas { struct iommufd_object obj; struct io_pagetable iopt; struct mutex mutex; struct list_head hwpt_list; }; static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx, u32 id) { return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS), struct iommufd_ioas, obj); } struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_ioas_destroy(struct iommufd_object *obj); int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd); int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommu_hwpt_get_dirty_bitmap *bitmap); /* * A HW pagetable is called an iommu_domain inside the kernel. This user object * allows directly creating and inspecting the domains. Domains that have kernel * owned page tables will be associated with an iommufd_ioas that provides the * IOVA to PFN map. */ struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; struct iommufd_fault *fault; bool pasid_compat : 1; }; struct iommufd_hwpt_paging { struct iommufd_hw_pagetable common; struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; struct iommufd_viommu *viommu; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) { return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; } static inline struct iommufd_hwpt_paging * to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) { return container_of(hwpt, struct iommufd_hwpt_paging, common); } static inline struct iommufd_hwpt_nested * to_hwpt_nested(struct iommufd_hw_pagetable *hwpt) { return container_of(hwpt, struct iommufd_hwpt_nested, common); } static inline struct iommufd_hwpt_paging * find_hwpt_paging(struct iommufd_hw_pagetable *hwpt) { switch (hwpt->obj.type) { case IOMMUFD_OBJ_HWPT_PAGING: return to_hwpt_paging(hwpt); case IOMMUFD_OBJ_HWPT_NESTED: return to_hwpt_nested(hwpt)->parent; default: return NULL; } } static inline struct iommufd_hwpt_paging * iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_HWPT_PAGING), struct iommufd_hwpt_paging, common.obj); } static inline struct iommufd_hw_pagetable * iommufd_get_hwpt_nested(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_HWPT_NESTED), struct iommufd_hw_pagetable, obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, ioasid_t pasid, u32 flags, bool immediate_attach, const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev, ioasid_t pasid); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); lockdep_assert_not_held(&hwpt_paging->ioas->mutex); if (hwpt_paging->auto_domain) { iommufd_object_put_and_try_destroy(ictx, &hwpt->obj); return; } } refcount_dec(&hwpt->obj.users); } struct iommufd_attach; struct iommufd_group { struct kref ref; struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; struct xarray pasid_attach; struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; /* * A iommufd_device object represents the binding relationship between a * consuming driver and the iommufd. These objects are created/destroyed by * external drivers, not by userspace. */ struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_group *igroup; struct list_head group_item; /* always the physical device */ struct device *dev; bool enforce_cache_coherency; /* protect iopf_enabled counter */ struct mutex iopf_lock; unsigned int iopf_enabled; }; static inline struct iommufd_device * iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_DEVICE), struct iommufd_device, obj); } void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_ioas *ioas; struct iommufd_ioas *ioas_unpin; struct mutex ioas_lock; const struct iommufd_access_ops *ops; void *data; unsigned long iova_alignment; u32 iopt_access_list_id; }; int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); struct iommufd_eventq { struct iommufd_object obj; struct iommufd_ctx *ictx; struct file *filep; spinlock_t lock; /* protects the deliver list */ struct list_head deliver; struct wait_queue_head wait_queue; }; struct iommufd_attach_handle { struct iommu_attach_handle handle; struct iommufd_device *idev; }; /* Convert an iommu attach handle to iommufd handle. */ #define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) /* * An iommufd_fault object represents an interface to deliver I/O page faults * to the user space. These objects are created/destroyed by the user space and * associated with hardware page table objects during page-table allocation. */ struct iommufd_fault { struct iommufd_eventq common; struct mutex mutex; /* serializes response flows */ struct xarray response; }; static inline struct iommufd_fault * eventq_to_fault(struct iommufd_eventq *eventq) { return container_of(eventq, struct iommufd_fault, common); } static inline struct iommufd_fault * iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_FAULT), struct iommufd_fault, common.obj); } int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); int iommufd_fault_iopf_handler(struct iopf_group *group); int iommufd_fault_iopf_enable(struct iommufd_device *idev); void iommufd_fault_iopf_disable(struct iommufd_device *idev); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle); /* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */ struct iommufd_vevent { struct iommufd_vevent_header header; struct list_head node; /* for iommufd_eventq::deliver */ ssize_t data_len; u64 event_data[] __counted_by(data_len); }; #define vevent_for_lost_events_header(vevent) \ (vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS) /* * An iommufd_veventq object represents an interface to deliver vIOMMU events to * the user space. It is created/destroyed by the user space and associated with * a vIOMMU object during the allocations. */ struct iommufd_veventq { struct iommufd_eventq common; struct iommufd_viommu *viommu; struct list_head node; /* for iommufd_viommu::veventqs */ struct iommufd_vevent lost_events_header; unsigned int type; unsigned int depth; /* Use common.lock for protection */ u32 num_events; u32 sequence; }; static inline struct iommufd_veventq * eventq_to_veventq(struct iommufd_eventq *eventq) { return container_of(eventq, struct iommufd_veventq, common); } static inline struct iommufd_veventq * iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_VEVENTQ), struct iommufd_veventq, common.obj); } int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd); void iommufd_veventq_destroy(struct iommufd_object *obj); void iommufd_veventq_abort(struct iommufd_object *obj); static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq, struct iommufd_vevent *vevent) { struct iommufd_eventq *eventq = &veventq->common; lockdep_assert_held(&eventq->lock); /* * Remove the lost_events_header and add the new node at the same time. * Note the new node can be lost_events_header, for a sequence update. */ if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver)) list_del(&veventq->lost_events_header.node); list_add_tail(&vevent->node, &eventq->deliver); vevent->header.sequence = veventq->sequence; veventq->sequence = (veventq->sequence + 1) & INT_MAX; wake_up_interruptible(&eventq->wait_queue); } static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_VIOMMU), struct iommufd_viommu, obj); } static inline struct iommufd_veventq * iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, u32 type) { struct iommufd_veventq *veventq, *next; lockdep_assert_held(&viommu->veventqs_rwsem); list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) { if (veventq->type == type) return veventq; } return NULL; } int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_vdevice_destroy(struct iommufd_object *obj); struct iommufd_vdevice { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_viommu *viommu; struct device *dev; u64 id; /* per-vIOMMU virtual ID */ }; #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); extern size_t iommufd_test_memory_limit; void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags); bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags) { } static inline bool iommufd_should_fail(void) { return false; } static inline int __init iommufd_test_init(void) { return 0; } static inline void iommufd_test_exit(void) { } static inline bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } #endif #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PART_STAT_H #define _LINUX_PART_STAT_H #include <linux/blkdev.h> #include <asm/local.h> struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; unsigned long sectors[NR_STAT_GROUPS]; unsigned long ios[NR_STAT_GROUPS]; unsigned long merges[NR_STAT_GROUPS]; unsigned long io_ticks; local_t in_flight[2]; }; /* * Macros to operate on percpu disk statistics: * * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should * be called between disk_stat_lock() and disk_stat_unlock(). * * part_stat_read() can be called at any time. */ #define part_stat_lock() preempt_disable() #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ (per_cpu_ptr((part)->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ TYPEOF_UNQUAL((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ res; \ }) static inline void part_stat_set_all(struct block_device *part, int value) { int i; for_each_possible_cpu(i) memset(per_cpu_ptr(part->bd_stats, i), value, sizeof(struct disk_stats)); } #define part_stat_read_accum(part, field) \ (part_stat_read(part, field[STAT_READ]) + \ part_stat_read(part, field[STAT_WRITE]) + \ part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ __this_cpu_add((part)->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ if (bdev_is_partition(part)) \ __part_stat_add(bdev_whole(part), field, addnd); \ } while (0) #define part_stat_dec(part, field) \ part_stat_add(part, field, -1) #define part_stat_inc(part, field) \ part_stat_add(part, field, 1) #define part_stat_sub(part, field, subnd) \ part_stat_add(part, field, -subnd) #define part_stat_local_dec(part, field) \ local_dec(&(part_stat_get(part, field))) #define part_stat_local_inc(part, field) \ local_inc(&(part_stat_get(part, field))) #define part_stat_local_read(part, field) \ local_read(&(part_stat_get(part, field))) #define part_stat_local_read_cpu(part, field, cpu) \ local_read(&(part_stat_get_cpu(part, field, cpu))) #endif /* _LINUX_PART_STAT_H */
10 103 103 103 74 187 187 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "clock.h" #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/preempt.h> static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args) { struct io_timer **_l = (struct io_timer **)l; struct io_timer **_r = (struct io_timer **)r; return (*_l)->expire < (*_r)->expire; } static const struct min_heap_callbacks callbacks = { .less = io_timer_cmp, .swp = NULL, }; void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { spin_lock(&clock->timer_lock); if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); return; } for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) goto out; BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); out: spin_unlock(&clock->timer_lock); } void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { spin_lock(&clock->timer_lock); for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) { min_heap_del(&clock->timers, i, &callbacks, NULL); break; } spin_unlock(&clock->timer_lock); } struct io_clock_wait { struct io_timer io_timer; struct timer_list cpu_timer; struct task_struct *task; int expired; }; static void io_clock_wait_fn(struct io_timer *timer) { struct io_clock_wait *wait = container_of(timer, struct io_clock_wait, io_timer); wait->expired = 1; wake_up_process(wait->task); } static void io_clock_cpu_timeout(struct timer_list *timer) { struct io_clock_wait *wait = container_of(timer, struct io_clock_wait, cpu_timer); wait->expired = 1; wake_up_process(wait->task); } void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { struct io_clock_wait wait = { .io_timer.expire = until, .io_timer.fn = io_clock_wait_fn, .io_timer.fn2 = (void *) _RET_IP_, .task = current, }; bch2_io_timer_add(clock, &wait.io_timer); schedule(); bch2_io_timer_del(clock, &wait.io_timer); } void bch2_kthread_io_clock_wait(struct io_clock *clock, u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait = { .io_timer.expire = io_until, .io_timer.fn = io_clock_wait_fn, .io_timer.fn2 = (void *) _RET_IP_, .task = current, }; bch2_io_timer_add(clock, &wait.io_timer); timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); do { set_current_state(TASK_INTERRUPTIBLE); if (kthread && kthread_should_stop()) break; if (wait.expired) break; schedule(); try_to_freeze(); } while (0); __set_current_state(TASK_RUNNING); timer_delete_sync(&wait.cpu_timer); destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); } static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; if (clock->timers.nr && time_after_eq64(now, clock->timers.data[0]->expire)) { ret = *min_heap_peek(&clock->timers); min_heap_pop(&clock->timers, &callbacks, NULL); } return ret; } void __bch2_increment_clock(struct io_clock *clock, u64 sectors) { struct io_timer *timer; u64 now = atomic64_add_return(sectors, &clock->now); spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { out->atomic++; spin_lock(&clock->timer_lock); u64 now = atomic64_read(&clock->now); printbuf_tabstop_push(out, 40); prt_printf(out, "current time:\t%llu\n", now); for (unsigned i = 0; i < clock->timers.nr; i++) prt_printf(out, "%ps %ps:\t%llu\n", clock->timers.data[i]->fn, clock->timers.data[i]->fn2, clock->timers.data[i]->expire); spin_unlock(&clock->timer_lock); --out->atomic; } void bch2_io_clock_exit(struct io_clock *clock) { free_heap(&clock->timers); free_percpu(clock->pcpu_buf); } int bch2_io_clock_init(struct io_clock *clock) { atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); if (!clock->pcpu_buf) return -BCH_ERR_ENOMEM_io_clock_init; if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) return -BCH_ERR_ENOMEM_io_clock_init; return 0; }
2 188 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_RTBITMAP_H__ #define __XFS_RTBITMAP_H__ #include "xfs_rtgroup.h" struct xfs_rtalloc_args { struct xfs_rtgroup *rtg; struct xfs_mount *mp; struct xfs_trans *tp; struct xfs_buf *rbmbp; /* bitmap block buffer */ struct xfs_buf *sumbp; /* summary block buffer */ xfs_fileoff_t rbmoff; /* bitmap block number */ xfs_fileoff_t sumoff; /* summary block number */ }; static inline xfs_rtblock_t xfs_rtx_to_rtb( struct xfs_rtgroup *rtg, xfs_rtxnum_t rtx) { struct xfs_mount *mp = rtg_mount(rtg); xfs_rtblock_t start = xfs_group_start_fsb(rtg_group(rtg)); if (mp->m_rtxblklog >= 0) return start + (rtx << mp->m_rtxblklog); return start + (rtx * mp->m_sb.sb_rextsize); } /* Convert an rgbno into an rt extent number. */ static inline xfs_rtxnum_t xfs_rgbno_to_rtx( struct xfs_mount *mp, xfs_rgblock_t rgbno) { if (likely(mp->m_rtxblklog >= 0)) return rgbno >> mp->m_rtxblklog; return rgbno / mp->m_sb.sb_rextsize; } static inline uint64_t xfs_rtbxlen_to_blen( struct xfs_mount *mp, xfs_rtbxlen_t rtbxlen) { if (mp->m_rtxblklog >= 0) return rtbxlen << mp->m_rtxblklog; return rtbxlen * mp->m_sb.sb_rextsize; } static inline xfs_extlen_t xfs_rtxlen_to_extlen( struct xfs_mount *mp, xfs_rtxlen_t rtxlen) { if (mp->m_rtxblklog >= 0) return rtxlen << mp->m_rtxblklog; return rtxlen * mp->m_sb.sb_rextsize; } /* Compute the misalignment between an extent length and a realtime extent .*/ static inline unsigned int xfs_extlen_to_rtxmod( struct xfs_mount *mp, xfs_extlen_t len) { if (mp->m_rtxblklog >= 0) return len & mp->m_rtxblkmask; return len % mp->m_sb.sb_rextsize; } static inline xfs_rtxlen_t xfs_extlen_to_rtxlen( struct xfs_mount *mp, xfs_extlen_t len) { if (mp->m_rtxblklog >= 0) return len >> mp->m_rtxblklog; return len / mp->m_sb.sb_rextsize; } /* Convert an rt block count into an rt extent count. */ static inline xfs_rtbxlen_t xfs_blen_to_rtbxlen( struct xfs_mount *mp, uint64_t blen) { if (likely(mp->m_rtxblklog >= 0)) return blen >> mp->m_rtxblklog; return div_u64(blen, mp->m_sb.sb_rextsize); } /* Return the offset of a file block length within an rt extent. */ static inline xfs_extlen_t xfs_blen_to_rtxoff( struct xfs_mount *mp, xfs_filblks_t blen) { if (likely(mp->m_rtxblklog >= 0)) return blen & mp->m_rtxblkmask; return do_div(blen, mp->m_sb.sb_rextsize); } /* Round this block count up to the nearest rt extent size. */ static inline xfs_filblks_t xfs_blen_roundup_rtx( struct xfs_mount *mp, xfs_filblks_t blen) { return roundup_64(blen, mp->m_sb.sb_rextsize); } /* Convert an rt block number into an rt extent number. */ static inline xfs_rtxnum_t xfs_rtb_to_rtx( struct xfs_mount *mp, xfs_rtblock_t rtbno) { /* open-coded 64-bit masking operation */ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno >> mp->m_rtxblklog; return div_u64(rtbno, mp->m_sb.sb_rextsize); } /* Return the offset of a rtgroup block number within an rt extent. */ static inline xfs_extlen_t xfs_rgbno_to_rtxoff( struct xfs_mount *mp, xfs_rgblock_t rgbno) { return rgbno % mp->m_sb.sb_rextsize; } /* Return the offset of an rt block number within an rt extent. */ static inline xfs_extlen_t xfs_rtb_to_rtxoff( struct xfs_mount *mp, xfs_rtblock_t rtbno) { /* open-coded 64-bit masking operation */ rtbno &= mp->m_groups[XG_TYPE_RTG].blkmask; if (likely(mp->m_rtxblklog >= 0)) return rtbno & mp->m_rtxblkmask; return do_div(rtbno, mp->m_sb.sb_rextsize); } /* Round this file block offset up to the nearest rt extent size. */ static inline xfs_rtblock_t xfs_fileoff_roundup_rtx( struct xfs_mount *mp, xfs_fileoff_t off) { return roundup_64(off, mp->m_sb.sb_rextsize); } /* Round this file block offset down to the nearest rt extent size. */ static inline xfs_rtblock_t xfs_fileoff_rounddown_rtx( struct xfs_mount *mp, xfs_fileoff_t off) { return rounddown_64(off, mp->m_sb.sb_rextsize); } /* Convert an rt extent number to a file block offset in the rt bitmap file. */ static inline xfs_fileoff_t xfs_rtx_to_rbmblock( struct xfs_mount *mp, xfs_rtxnum_t rtx) { if (xfs_has_rtgroups(mp)) return div_u64(rtx, mp->m_rtx_per_rbmblock); return rtx >> mp->m_blkbit_log; } /* Convert an rt extent number to a word offset within an rt bitmap block. */ static inline unsigned int xfs_rtx_to_rbmword( struct xfs_mount *mp, xfs_rtxnum_t rtx) { if (xfs_has_rtgroups(mp)) { unsigned int mod; div_u64_rem(rtx >> XFS_NBWORDLOG, mp->m_blockwsize, &mod); return mod; } return (rtx >> XFS_NBWORDLOG) & (mp->m_blockwsize - 1); } /* Convert a file block offset in the rt bitmap file to an rt extent number. */ static inline xfs_rtxnum_t xfs_rbmblock_to_rtx( struct xfs_mount *mp, xfs_fileoff_t rbmoff) { if (xfs_has_rtgroups(mp)) return rbmoff * mp->m_rtx_per_rbmblock; return rbmoff << mp->m_blkbit_log; } /* Return a pointer to a bitmap word within a rt bitmap block. */ static inline union xfs_rtword_raw * xfs_rbmblock_wordptr( struct xfs_rtalloc_args *args, unsigned int index) { struct xfs_mount *mp = args->mp; union xfs_rtword_raw *words; struct xfs_rtbuf_blkinfo *hdr = args->rbmbp->b_addr; if (xfs_has_rtgroups(mp)) words = (union xfs_rtword_raw *)(hdr + 1); else words = args->rbmbp->b_addr; return words + index; } /* Convert an ondisk bitmap word to its incore representation. */ static inline xfs_rtword_t xfs_rtbitmap_getword( struct xfs_rtalloc_args *args, unsigned int index) { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); if (xfs_has_rtgroups(args->mp)) return be32_to_cpu(word->rtg); return word->old; } /* Set an ondisk bitmap word from an incore representation. */ static inline void xfs_rtbitmap_setword( struct xfs_rtalloc_args *args, unsigned int index, xfs_rtword_t value) { union xfs_rtword_raw *word = xfs_rbmblock_wordptr(args, index); if (xfs_has_rtgroups(args->mp)) word->rtg = cpu_to_be32(value); else word->old = value; } /* * Convert a rt extent length and rt bitmap block number to a xfs_suminfo_t * offset within the rt summary file. */ static inline xfs_rtsumoff_t xfs_rtsumoffs( struct xfs_mount *mp, int log2_len, xfs_fileoff_t rbmoff) { return log2_len * mp->m_sb.sb_rbmblocks + rbmoff; } /* * Convert an xfs_suminfo_t offset to a file block offset within the rt summary * file. */ static inline xfs_fileoff_t xfs_rtsumoffs_to_block( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { if (xfs_has_rtgroups(mp)) return rsumoff / mp->m_blockwsize; return XFS_B_TO_FSBT(mp, rsumoff * sizeof(xfs_suminfo_t)); } /* * Convert an xfs_suminfo_t offset to an info word offset within an rt summary * block. */ static inline unsigned int xfs_rtsumoffs_to_infoword( struct xfs_mount *mp, xfs_rtsumoff_t rsumoff) { unsigned int mask = mp->m_blockmask >> XFS_SUMINFOLOG; if (xfs_has_rtgroups(mp)) return rsumoff % mp->m_blockwsize; return rsumoff & mask; } /* Return a pointer to a summary info word within a rt summary block. */ static inline union xfs_suminfo_raw * xfs_rsumblock_infoptr( struct xfs_rtalloc_args *args, unsigned int index) { union xfs_suminfo_raw *info; struct xfs_rtbuf_blkinfo *hdr = args->sumbp->b_addr; if (xfs_has_rtgroups(args->mp)) info = (union xfs_suminfo_raw *)(hdr + 1); else info = args->sumbp->b_addr; return info + index; } /* Get the current value of a summary counter. */ static inline xfs_suminfo_t xfs_suminfo_get( struct xfs_rtalloc_args *args, unsigned int index) { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); if (xfs_has_rtgroups(args->mp)) return be32_to_cpu(info->rtg); return info->old; } /* Add to the current value of a summary counter and return the new value. */ static inline xfs_suminfo_t xfs_suminfo_add( struct xfs_rtalloc_args *args, unsigned int index, int delta) { union xfs_suminfo_raw *info = xfs_rsumblock_infoptr(args, index); if (xfs_has_rtgroups(args->mp)) { be32_add_cpu(&info->rtg, delta); return be32_to_cpu(info->rtg); } info->old += delta; return info->old; } static inline const struct xfs_buf_ops * xfs_rtblock_ops( struct xfs_mount *mp, enum xfs_rtg_inodes type) { if (xfs_has_rtgroups(mp)) { if (type == XFS_RTGI_SUMMARY) return &xfs_rtsummary_buf_ops; return &xfs_rtbitmap_buf_ops; } return &xfs_rtbuf_ops; } /* * Functions for walking free space rtextents in the realtime bitmap. */ struct xfs_rtalloc_rec { xfs_rtxnum_t ar_startext; xfs_rtbxlen_t ar_extcount; }; typedef int (*xfs_rtalloc_query_range_fn)( struct xfs_rtgroup *rtg, struct xfs_trans *tp, const struct xfs_rtalloc_rec *rec, void *priv); #ifdef CONFIG_XFS_RT void xfs_rtbuf_cache_relse(struct xfs_rtalloc_args *args); int xfs_rtbitmap_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); int xfs_rtsummary_read_buf(struct xfs_rtalloc_args *args, xfs_fileoff_t block); int xfs_rtcheck_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val, xfs_rtxnum_t *new, int *stat); int xfs_rtfind_back(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t *rtblock); int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val); int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len); int xfs_rtalloc_query_range(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxnum_t end, xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_query_all(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_extent_is_free(struct xfs_rtgroup *rtg, struct xfs_trans *tp, xfs_rtxnum_t start, xfs_rtxlen_t len, bool *is_free); int xfs_rtfree_extent(struct xfs_trans *tp, struct xfs_rtgroup *rtg, xfs_rtxnum_t start, xfs_rtxlen_t len); /* Same as above, but in units of rt blocks. */ int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); xfs_rtxnum_t xfs_rtbitmap_rtx_per_rbmblock(struct xfs_mount *mp); xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp); xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); xfs_filblks_t xfs_rtsummary_blockcount(struct xfs_mount *mp, unsigned int *rsumlevels); int xfs_rtfile_initialize_blocks(struct xfs_rtgroup *rtg, enum xfs_rtg_inodes type, xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb, void *data); int xfs_rtbitmap_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, struct xfs_trans *tp, bool init); int xfs_rtsummary_create(struct xfs_rtgroup *rtg, struct xfs_inode *ip, struct xfs_trans *tp, bool init); #else /* CONFIG_XFS_RT */ # define xfs_rtfree_extent(t,b,l) (-ENOSYS) static inline int xfs_rtfree_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, xfs_fsblock_t rtbno, xfs_filblks_t rtlen) { return -ENOSYS; } # define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) # define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) # define xfs_rtbitmap_read_buf(a,b) (-ENOSYS) # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) static inline xfs_filblks_t xfs_rtbitmap_blockcount_len(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { /* shut up gcc */ return 0; } #endif /* CONFIG_XFS_RT */ #endif /* __XFS_RTBITMAP_H__ */
2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 // SPDX-License-Identifier: GPL-2.0-or-later /* * Core registration and callback routines for MTD * drivers and users. * * Copyright © 1999-2010 David Woodhouse <dwmw2@infradead.org> * Copyright © 2006 Red Hat UK Limited */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/ioctl.h> #include <linux/init.h> #include <linux/of.h> #include <linux/proc_fs.h> #include <linux/idr.h> #include <linux/backing-dev.h> #include <linux/gfp.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/reboot.h> #include <linux/leds.h> #include <linux/debugfs.h> #include <linux/nvmem-provider.h> #include <linux/root_dev.h> #include <linux/error-injection.h> #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> #include "mtdcore.h" struct backing_dev_info *mtd_bdi; #ifdef CONFIG_PM_SLEEP static int mtd_cls_suspend(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); return mtd ? mtd_suspend(mtd) : 0; } static int mtd_cls_resume(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); if (mtd) mtd_resume(mtd); return 0; } static SIMPLE_DEV_PM_OPS(mtd_cls_pm_ops, mtd_cls_suspend, mtd_cls_resume); #define MTD_CLS_PM_OPS (&mtd_cls_pm_ops) #else #define MTD_CLS_PM_OPS NULL #endif static struct class mtd_class = { .name = "mtd", .pm = MTD_CLS_PM_OPS, }; static DEFINE_IDR(mtd_idr); /* These are exported solely for the purpose of mtd_blkdevs.c. You should not use them for _anything_ else */ DEFINE_MUTEX(mtd_table_mutex); EXPORT_SYMBOL_GPL(mtd_table_mutex); struct mtd_info *__mtd_next_device(int i) { return idr_get_next(&mtd_idr, &i); } EXPORT_SYMBOL_GPL(__mtd_next_device); static LIST_HEAD(mtd_notifiers); #define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2) /* REVISIT once MTD uses the driver model better, whoever allocates * the mtd_info will probably want to use the release() hook... */ static void mtd_release(struct device *dev) { struct mtd_info *mtd = dev_get_drvdata(dev); dev_t index = MTD_DEVT(mtd->index); idr_remove(&mtd_idr, mtd->index); of_node_put(mtd_get_of_node(mtd)); if (mtd_is_partition(mtd)) release_mtd_partition(mtd); /* remove /dev/mtdXro node */ device_destroy(&mtd_class, index + 1); } static void mtd_device_release(struct kref *kref) { struct mtd_info *mtd = container_of(kref, struct mtd_info, refcnt); bool is_partition = mtd_is_partition(mtd); debugfs_remove_recursive(mtd->dbg.dfs_dir); /* Try to remove the NVMEM provider */ nvmem_unregister(mtd->nvmem); device_unregister(&mtd->dev); /* * Clear dev so mtd can be safely re-registered later if desired. * Should not be done for partition, * as it was already destroyed in device_unregister(). */ if (!is_partition) memset(&mtd->dev, 0, sizeof(mtd->dev)); module_put(THIS_MODULE); } #define MTD_DEVICE_ATTR_RO(name) \ static DEVICE_ATTR(name, 0444, mtd_##name##_show, NULL) #define MTD_DEVICE_ATTR_RW(name) \ static DEVICE_ATTR(name, 0644, mtd_##name##_show, mtd_##name##_store) static ssize_t mtd_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); char *type; switch (mtd->type) { case MTD_ABSENT: type = "absent"; break; case MTD_RAM: type = "ram"; break; case MTD_ROM: type = "rom"; break; case MTD_NORFLASH: type = "nor"; break; case MTD_NANDFLASH: type = "nand"; break; case MTD_DATAFLASH: type = "dataflash"; break; case MTD_UBIVOLUME: type = "ubi"; break; case MTD_MLCNANDFLASH: type = "mlc-nand"; break; default: type = "unknown"; } return sysfs_emit(buf, "%s\n", type); } MTD_DEVICE_ATTR_RO(type); static ssize_t mtd_flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "0x%lx\n", (unsigned long)mtd->flags); } MTD_DEVICE_ATTR_RO(flags); static ssize_t mtd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%llu\n", (unsigned long long)mtd->size); } MTD_DEVICE_ATTR_RO(size); static ssize_t mtd_erasesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->erasesize); } MTD_DEVICE_ATTR_RO(erasesize); static ssize_t mtd_writesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->writesize); } MTD_DEVICE_ATTR_RO(writesize); static ssize_t mtd_subpagesize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); unsigned int subpagesize = mtd->writesize >> mtd->subpage_sft; return sysfs_emit(buf, "%u\n", subpagesize); } MTD_DEVICE_ATTR_RO(subpagesize); static ssize_t mtd_oobsize_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%lu\n", (unsigned long)mtd->oobsize); } MTD_DEVICE_ATTR_RO(oobsize); static ssize_t mtd_oobavail_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->oobavail); } MTD_DEVICE_ATTR_RO(oobavail); static ssize_t mtd_numeraseregions_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->numeraseregions); } MTD_DEVICE_ATTR_RO(numeraseregions); static ssize_t mtd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", mtd->name); } MTD_DEVICE_ATTR_RO(name); static ssize_t mtd_ecc_strength_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->ecc_strength); } MTD_DEVICE_ATTR_RO(ecc_strength); static ssize_t mtd_bitflip_threshold_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->bitflip_threshold); } static ssize_t mtd_bitflip_threshold_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct mtd_info *mtd = dev_get_drvdata(dev); unsigned int bitflip_threshold; int retval; retval = kstrtouint(buf, 0, &bitflip_threshold); if (retval) return retval; mtd->bitflip_threshold = bitflip_threshold; return count; } MTD_DEVICE_ATTR_RW(bitflip_threshold); static ssize_t mtd_ecc_step_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); return sysfs_emit(buf, "%u\n", mtd->ecc_step_size); } MTD_DEVICE_ATTR_RO(ecc_step_size); static ssize_t mtd_corrected_bits_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->corrected); } MTD_DEVICE_ATTR_RO(corrected_bits); /* ecc stats corrected */ static ssize_t mtd_ecc_failures_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->failed); } MTD_DEVICE_ATTR_RO(ecc_failures); /* ecc stats errors */ static ssize_t mtd_bad_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->badblocks); } MTD_DEVICE_ATTR_RO(bad_blocks); static ssize_t mtd_bbt_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mtd_info *mtd = dev_get_drvdata(dev); struct mtd_ecc_stats *ecc_stats = &mtd->ecc_stats; return sysfs_emit(buf, "%u\n", ecc_stats->bbtblocks); } MTD_DEVICE_ATTR_RO(bbt_blocks); static struct attribute *mtd_attrs[] = { &dev_attr_type.attr, &dev_attr_flags.attr, &dev_attr_size.attr, &dev_attr_erasesize.attr, &dev_attr_writesize.attr, &dev_attr_subpagesize.attr, &dev_attr_oobsize.attr, &dev_attr_oobavail.attr, &dev_attr_numeraseregions.attr, &dev_attr_name.attr, &dev_attr_ecc_strength.attr, &dev_attr_ecc_step_size.attr, &dev_attr_corrected_bits.attr, &dev_attr_ecc_failures.attr, &dev_attr_bad_blocks.attr, &dev_attr_bbt_blocks.attr, &dev_attr_bitflip_threshold.attr, NULL, }; ATTRIBUTE_GROUPS(mtd); static const struct device_type mtd_devtype = { .name = "mtd", .groups = mtd_groups, .release = mtd_release, }; static bool mtd_expert_analysis_mode; #ifdef CONFIG_DEBUG_FS bool mtd_check_expert_analysis_mode(void) { const char *mtd_expert_analysis_warning = "Bad block checks have been entirely disabled.\n" "This is only reserved for post-mortem forensics and debug purposes.\n" "Never enable this mode if you do not know what you are doing!\n"; return WARN_ONCE(mtd_expert_analysis_mode, mtd_expert_analysis_warning); } EXPORT_SYMBOL_GPL(mtd_check_expert_analysis_mode); #endif static struct dentry *dfs_dir_mtd; static void mtd_debugfs_populate(struct mtd_info *mtd) { struct device *dev = &mtd->dev; if (IS_ERR_OR_NULL(dfs_dir_mtd)) return; mtd->dbg.dfs_dir = debugfs_create_dir(dev_name(dev), dfs_dir_mtd); } #ifndef CONFIG_MMU unsigned mtd_mmap_capabilities(struct mtd_info *mtd) { switch (mtd->type) { case MTD_RAM: return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | NOMMU_MAP_READ | NOMMU_MAP_WRITE; case MTD_ROM: return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | NOMMU_MAP_READ; default: return NOMMU_MAP_COPY; } } EXPORT_SYMBOL_GPL(mtd_mmap_capabilities); #endif static int mtd_reboot_notifier(struct notifier_block *n, unsigned long state, void *cmd) { struct mtd_info *mtd; mtd = container_of(n, struct mtd_info, reboot_notifier); mtd->_reboot(mtd); return NOTIFY_DONE; } /** * mtd_wunit_to_pairing_info - get pairing information of a wunit * @mtd: pointer to new MTD device info structure * @wunit: write unit we are interested in * @info: returned pairing information * * Retrieve pairing information associated to the wunit. * This is mainly useful when dealing with MLC/TLC NANDs where pages can be * paired together, and where programming a page may influence the page it is * paired with. * The notion of page is replaced by the term wunit (write-unit) to stay * consistent with the ->writesize field. * * The @wunit argument can be extracted from an absolute offset using * mtd_offset_to_wunit(). @info is filled with the pairing information attached * to @wunit. * * From the pairing info the MTD user can find all the wunits paired with * @wunit using the following loop: * * for (i = 0; i < mtd_pairing_groups(mtd); i++) { * info.pair = i; * mtd_pairing_info_to_wunit(mtd, &info); * ... * } */ int mtd_wunit_to_pairing_info(struct mtd_info *mtd, int wunit, struct mtd_pairing_info *info) { struct mtd_info *master = mtd_get_master(mtd); int npairs = mtd_wunit_per_eb(master) / mtd_pairing_groups(master); if (wunit < 0 || wunit >= npairs) return -EINVAL; if (master->pairing && master->pairing->get_info) return master->pairing->get_info(master, wunit, info); info->group = 0; info->pair = wunit; return 0; } EXPORT_SYMBOL_GPL(mtd_wunit_to_pairing_info); /** * mtd_pairing_info_to_wunit - get wunit from pairing information * @mtd: pointer to new MTD device info structure * @info: pairing information struct * * Returns a positive number representing the wunit associated to the info * struct, or a negative error code. * * This is the reverse of mtd_wunit_to_pairing_info(), and can help one to * iterate over all wunits of a given pair (see mtd_wunit_to_pairing_info() * doc). * * It can also be used to only program the first page of each pair (i.e. * page attached to group 0), which allows one to use an MLC NAND in * software-emulated SLC mode: * * info.group = 0; * npairs = mtd_wunit_per_eb(mtd) / mtd_pairing_groups(mtd); * for (info.pair = 0; info.pair < npairs; info.pair++) { * wunit = mtd_pairing_info_to_wunit(mtd, &info); * mtd_write(mtd, mtd_wunit_to_offset(mtd, blkoffs, wunit), * mtd->writesize, &retlen, buf + (i * mtd->writesize)); * } */ int mtd_pairing_info_to_wunit(struct mtd_info *mtd, const struct mtd_pairing_info *info) { struct mtd_info *master = mtd_get_master(mtd); int ngroups = mtd_pairing_groups(master); int npairs = mtd_wunit_per_eb(master) / ngroups; if (!info || info->pair < 0 || info->pair >= npairs || info->group < 0 || info->group >= ngroups) return -EINVAL; if (master->pairing && master->pairing->get_wunit) return mtd->pairing->get_wunit(master, info); return info->pair; } EXPORT_SYMBOL_GPL(mtd_pairing_info_to_wunit); /** * mtd_pairing_groups - get the number of pairing groups * @mtd: pointer to new MTD device info structure * * Returns the number of pairing groups. * * This number is usually equal to the number of bits exposed by a single * cell, and can be used in conjunction with mtd_pairing_info_to_wunit() * to iterate over all pages of a given pair. */ int mtd_pairing_groups(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); if (!master->pairing || !master->pairing->ngroups) return 1; return master->pairing->ngroups; } EXPORT_SYMBOL_GPL(mtd_pairing_groups); static int mtd_nvmem_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int err; err = mtd_read(mtd, offset, bytes, &retlen, val); if (err && err != -EUCLEAN) return err; return retlen == bytes ? 0 : -EIO; } static int mtd_nvmem_add(struct mtd_info *mtd) { struct device_node *node = mtd_get_of_node(mtd); struct nvmem_config config = {}; config.id = NVMEM_DEVID_NONE; config.dev = &mtd->dev; config.name = dev_name(&mtd->dev); config.owner = THIS_MODULE; config.add_legacy_fixed_of_cells = of_device_is_compatible(node, "nvmem-cells"); config.reg_read = mtd_nvmem_reg_read; config.size = mtd->size; config.word_size = 1; config.stride = 1; config.read_only = true; config.root_only = true; config.ignore_wp = true; config.priv = mtd; mtd->nvmem = nvmem_register(&config); if (IS_ERR(mtd->nvmem)) { /* Just ignore if there is no NVMEM support in the kernel */ if (PTR_ERR(mtd->nvmem) == -EOPNOTSUPP) mtd->nvmem = NULL; else return dev_err_probe(&mtd->dev, PTR_ERR(mtd->nvmem), "Failed to register NVMEM device\n"); } return 0; } static void mtd_check_of_node(struct mtd_info *mtd) { struct device_node *partitions, *parent_dn, *mtd_dn = NULL; const char *pname, *prefix = "partition-"; int plen, mtd_name_len, offset, prefix_len; /* Check if MTD already has a device node */ if (mtd_get_of_node(mtd)) return; if (!mtd_is_partition(mtd)) return; parent_dn = of_node_get(mtd_get_of_node(mtd->parent)); if (!parent_dn) return; if (mtd_is_partition(mtd->parent)) partitions = of_node_get(parent_dn); else partitions = of_get_child_by_name(parent_dn, "partitions"); if (!partitions) goto exit_parent; prefix_len = strlen(prefix); mtd_name_len = strlen(mtd->name); /* Search if a partition is defined with the same name */ for_each_child_of_node(partitions, mtd_dn) { /* Skip partition with no/wrong prefix */ if (!of_node_name_prefix(mtd_dn, prefix)) continue; /* Label have priority. Check that first */ if (!of_property_read_string(mtd_dn, "label", &pname)) { offset = 0; } else { pname = mtd_dn->name; offset = prefix_len; } plen = strlen(pname) - offset; if (plen == mtd_name_len && !strncmp(mtd->name, pname + offset, plen)) { mtd_set_of_node(mtd, mtd_dn); of_node_put(mtd_dn); break; } } of_node_put(partitions); exit_parent: of_node_put(parent_dn); } /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure * * Add a device to the list of MTD devices present in the system, and * notify each currently active MTD 'user' of its arrival. Returns * zero on success or non-zero on failure. */ int add_mtd_device(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); struct mtd_info *master = mtd_get_master(mtd); struct mtd_notifier *not; int i, error, ofidx; /* * May occur, for instance, on buggy drivers which call * mtd_device_parse_register() multiple times on the same master MTD, * especially with CONFIG_MTD_PARTITIONED_MASTER=y. */ if (WARN_ONCE(mtd->dev.type, "MTD already registered\n")) return -EEXIST; BUG_ON(mtd->writesize == 0); /* * MTD drivers should implement ->_{write,read}() or * ->_{write,read}_oob(), but not both. */ if (WARN_ON((mtd->_write && mtd->_write_oob) || (mtd->_read && mtd->_read_oob))) return -EINVAL; if (WARN_ON((!mtd->erasesize || !master->_erase) && !(mtd->flags & MTD_NO_ERASE))) return -EINVAL; /* * MTD_SLC_ON_MLC_EMULATION can only be set on partitions, when the * master is an MLC NAND and has a proper pairing scheme defined. * We also reject masters that implement ->_writev() for now, because * NAND controller drivers don't implement this hook, and adding the * SLC -> MLC address/length conversion to this path is useless if we * don't have a user. */ if (mtd->flags & MTD_SLC_ON_MLC_EMULATION && (!mtd_is_partition(mtd) || master->type != MTD_MLCNANDFLASH || !master->pairing || master->_writev)) return -EINVAL; mutex_lock(&mtd_table_mutex); ofidx = -1; if (np) ofidx = of_alias_get_id(np, "mtd"); if (ofidx >= 0) i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); else i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); if (i < 0) { error = i; goto fail_locked; } mtd->index = i; kref_init(&mtd->refcnt); /* default value if not set by driver */ if (mtd->bitflip_threshold == 0) mtd->bitflip_threshold = mtd->ecc_strength; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { int ngroups = mtd_pairing_groups(master); mtd->erasesize /= ngroups; mtd->size = (u64)mtd_div_by_eb(mtd->size, master) * mtd->erasesize; } if (is_power_of_2(mtd->erasesize)) mtd->erasesize_shift = ffs(mtd->erasesize) - 1; else mtd->erasesize_shift = 0; if (is_power_of_2(mtd->writesize)) mtd->writesize_shift = ffs(mtd->writesize) - 1; else mtd->writesize_shift = 0; mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1; mtd->writesize_mask = (1 << mtd->writesize_shift) - 1; /* Some chips always power up locked. Unlock them now */ if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK)) { error = mtd_unlock(mtd, 0, mtd->size); if (error && error != -EOPNOTSUPP) printk(KERN_WARNING "%s: unlock failed, writes may not work\n", mtd->name); /* Ignore unlock failures? */ error = 0; } /* Caller should have set dev.parent to match the * physical device, if appropriate. */ mtd->dev.type = &mtd_devtype; mtd->dev.class = &mtd_class; mtd->dev.devt = MTD_DEVT(i); error = dev_set_name(&mtd->dev, "mtd%d", i); if (error) goto fail_devname; dev_set_drvdata(&mtd->dev, mtd); mtd_check_of_node(mtd); of_node_get(mtd_get_of_node(mtd)); error = device_register(&mtd->dev); if (error) { put_device(&mtd->dev); goto fail_added; } /* Add the nvmem provider */ error = mtd_nvmem_add(mtd); if (error) goto fail_nvmem_add; mtd_debugfs_populate(mtd); device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, "mtd%dro", i); pr_debug("mtd: Giving out device %d to %s\n", i, mtd->name); /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) not->add(mtd); mutex_unlock(&mtd_table_mutex); if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) { if (IS_BUILTIN(CONFIG_MTD)) { pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name); ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index); } else { pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n", mtd->index, mtd->name); } } /* We _know_ we aren't being removed, because our caller is still holding us here. So none of this try_ nonsense, and no bitching about it either. :) */ __module_get(THIS_MODULE); return 0; fail_nvmem_add: device_unregister(&mtd->dev); fail_added: of_node_put(mtd_get_of_node(mtd)); fail_devname: idr_remove(&mtd_idr, i); fail_locked: mutex_unlock(&mtd_table_mutex); return error; } /** * del_mtd_device - unregister an MTD device * @mtd: pointer to MTD device info structure * * Remove a device from the list of MTD devices present in the system, * and notify each currently active MTD 'user' of its departure. * Returns zero on success or 1 on failure, which currently will happen * if the requested device does not appear to be present in the list. */ int del_mtd_device(struct mtd_info *mtd) { int ret; struct mtd_notifier *not; mutex_lock(&mtd_table_mutex); if (idr_find(&mtd_idr, mtd->index) != mtd) { ret = -ENODEV; goto out_error; } /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) not->remove(mtd); kref_put(&mtd->refcnt, mtd_device_release); ret = 0; out_error: mutex_unlock(&mtd_table_mutex); return ret; } /* * Set a few defaults based on the parent devices, if not provided by the * driver */ static void mtd_set_dev_defaults(struct mtd_info *mtd) { if (mtd->dev.parent) { if (!mtd->owner && mtd->dev.parent->driver) mtd->owner = mtd->dev.parent->driver->owner; if (!mtd->name) mtd->name = dev_name(mtd->dev.parent); } else { pr_debug("mtd device won't show a device symlink in sysfs\n"); } INIT_LIST_HEAD(&mtd->partitions); mutex_init(&mtd->master.partitions_lock); mutex_init(&mtd->master.chrdev_lock); } static ssize_t mtd_otp_size(struct mtd_info *mtd, bool is_user) { struct otp_info *info; ssize_t size = 0; unsigned int i; size_t retlen; int ret; info = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!info) return -ENOMEM; if (is_user) ret = mtd_get_user_prot_info(mtd, PAGE_SIZE, &retlen, info); else ret = mtd_get_fact_prot_info(mtd, PAGE_SIZE, &retlen, info); if (ret) goto err; for (i = 0; i < retlen / sizeof(*info); i++) size += info[i].length; kfree(info); return size; err: kfree(info); /* ENODATA means there is no OTP region. */ return ret == -ENODATA ? 0 : ret; } static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd, const char *compatible, int size, nvmem_reg_read_t reg_read) { struct nvmem_device *nvmem = NULL; struct nvmem_config config = {}; struct device_node *np; /* DT binding is optional */ np = of_get_compatible_child(mtd->dev.of_node, compatible); /* OTP nvmem will be registered on the physical device */ config.dev = mtd->dev.parent; config.name = compatible; config.id = NVMEM_DEVID_AUTO; config.owner = THIS_MODULE; config.add_legacy_fixed_of_cells = !mtd_type_is_nand(mtd); config.type = NVMEM_TYPE_OTP; config.root_only = true; config.ignore_wp = true; config.reg_read = reg_read; config.size = size; config.of_node = np; config.priv = mtd; nvmem = nvmem_register(&config); /* Just ignore if there is no NVMEM support in the kernel */ if (IS_ERR(nvmem) && PTR_ERR(nvmem) == -EOPNOTSUPP) nvmem = NULL; of_node_put(np); return nvmem; } static int mtd_nvmem_user_otp_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int ret; ret = mtd_read_user_prot_reg(mtd, offset, bytes, &retlen, val); if (ret) return ret; return retlen == bytes ? 0 : -EIO; } static int mtd_nvmem_fact_otp_reg_read(void *priv, unsigned int offset, void *val, size_t bytes) { struct mtd_info *mtd = priv; size_t retlen; int ret; ret = mtd_read_fact_prot_reg(mtd, offset, bytes, &retlen, val); if (ret) return ret; return retlen == bytes ? 0 : -EIO; } static int mtd_otp_nvmem_add(struct mtd_info *mtd) { struct device *dev = mtd->dev.parent; struct nvmem_device *nvmem; ssize_t size; int err; if (mtd->_get_user_prot_info && mtd->_read_user_prot_reg) { size = mtd_otp_size(mtd, true); if (size < 0) { err = size; goto err; } if (size > 0) { nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size, mtd_nvmem_user_otp_reg_read); if (IS_ERR(nvmem)) { err = PTR_ERR(nvmem); goto err; } mtd->otp_user_nvmem = nvmem; } } if (mtd->_get_fact_prot_info && mtd->_read_fact_prot_reg) { size = mtd_otp_size(mtd, false); if (size < 0) { err = size; goto err; } if (size > 0) { /* * The factory OTP contains thing such as a unique serial * number and is small, so let's read it out and put it * into the entropy pool. */ void *otp; otp = kmalloc(size, GFP_KERNEL); if (!otp) { err = -ENOMEM; goto err; } err = mtd_nvmem_fact_otp_reg_read(mtd, 0, otp, size); if (err < 0) { kfree(otp); goto err; } add_device_randomness(otp, err); kfree(otp); nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size, mtd_nvmem_fact_otp_reg_read); if (IS_ERR(nvmem)) { err = PTR_ERR(nvmem); goto err; } mtd->otp_factory_nvmem = nvmem; } } return 0; err: nvmem_unregister(mtd->otp_user_nvmem); /* Don't report error if OTP is not supported. */ if (err == -EOPNOTSUPP) return 0; return dev_err_probe(dev, err, "Failed to register OTP NVMEM device\n"); } /** * mtd_device_parse_register - parse partitions and register an MTD device. * * @mtd: the MTD device to register * @types: the list of MTD partition probes to try, see * 'parse_mtd_partitions()' for more information * @parser_data: MTD partition parser-specific data * @parts: fallback partition information to register, if parsing fails; * only valid if %nr_parts > %0 * @nr_parts: the number of partitions in parts, if zero then the full * MTD device is registered if no partition info is found * * This function aggregates MTD partitions parsing (done by * 'parse_mtd_partitions()') and MTD device and partitions registering. It * basically follows the most common pattern found in many MTD drivers: * * * If the MTD_PARTITIONED_MASTER option is set, then the device as a whole is * registered first. * * Then It tries to probe partitions on MTD device @mtd using parsers * specified in @types (if @types is %NULL, then the default list of parsers * is used, see 'parse_mtd_partitions()' for more information). If none are * found this functions tries to fallback to information specified in * @parts/@nr_parts. * * If no partitions were found this function just registers the MTD device * @mtd and exits. * * Returns zero in case of success and a negative error code in case of failure. */ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, struct mtd_part_parser_data *parser_data, const struct mtd_partition *parts, int nr_parts) { int ret, err; mtd_set_dev_defaults(mtd); ret = mtd_otp_nvmem_add(mtd); if (ret) goto out; if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) { ret = add_mtd_device(mtd); if (ret) goto out; } /* Prefer parsed partitions over driver-provided fallback */ ret = parse_mtd_partitions(mtd, types, parser_data); if (ret == -EPROBE_DEFER) goto out; if (ret > 0) ret = 0; else if (nr_parts) ret = add_mtd_partitions(mtd, parts, nr_parts); else if (!device_is_registered(&mtd->dev)) ret = add_mtd_device(mtd); else ret = 0; if (ret) goto out; /* * FIXME: some drivers unfortunately call this function more than once. * So we have to check if we've already assigned the reboot notifier. * * Generally, we can make multiple calls work for most cases, but it * does cause problems with parse_mtd_partitions() above (e.g., * cmdlineparts will register partitions more than once). */ WARN_ONCE(mtd->_reboot && mtd->reboot_notifier.notifier_call, "MTD already registered\n"); if (mtd->_reboot && !mtd->reboot_notifier.notifier_call) { mtd->reboot_notifier.notifier_call = mtd_reboot_notifier; register_reboot_notifier(&mtd->reboot_notifier); } out: if (ret) { nvmem_unregister(mtd->otp_user_nvmem); nvmem_unregister(mtd->otp_factory_nvmem); } if (ret && device_is_registered(&mtd->dev)) { err = del_mtd_device(mtd); if (err) pr_err("Error when deleting MTD device (%d)\n", err); } return ret; } EXPORT_SYMBOL_GPL(mtd_device_parse_register); /** * mtd_device_unregister - unregister an existing MTD device. * * @master: the MTD device to unregister. This will unregister both the master * and any partitions if registered. */ int mtd_device_unregister(struct mtd_info *master) { int err; if (master->_reboot) { unregister_reboot_notifier(&master->reboot_notifier); memset(&master->reboot_notifier, 0, sizeof(master->reboot_notifier)); } nvmem_unregister(master->otp_user_nvmem); nvmem_unregister(master->otp_factory_nvmem); err = del_mtd_partitions(master); if (err) return err; if (!device_is_registered(&master->dev)) return 0; return del_mtd_device(master); } EXPORT_SYMBOL_GPL(mtd_device_unregister); /** * register_mtd_user - register a 'user' of MTD devices. * @new: pointer to notifier info structure * * Registers a pair of callbacks function to be called upon addition * or removal of MTD devices. Causes the 'add' callback to be immediately * invoked for each MTD device currently present in the system. */ void register_mtd_user (struct mtd_notifier *new) { struct mtd_info *mtd; mutex_lock(&mtd_table_mutex); list_add(&new->list, &mtd_notifiers); __module_get(THIS_MODULE); mtd_for_each_device(mtd) new->add(mtd); mutex_unlock(&mtd_table_mutex); } EXPORT_SYMBOL_GPL(register_mtd_user); /** * unregister_mtd_user - unregister a 'user' of MTD devices. * @old: pointer to notifier info structure * * Removes a callback function pair from the list of 'users' to be * notified upon addition or removal of MTD devices. Causes the * 'remove' callback to be immediately invoked for each MTD device * currently present in the system. */ int unregister_mtd_user (struct mtd_notifier *old) { struct mtd_info *mtd; mutex_lock(&mtd_table_mutex); module_put(THIS_MODULE); mtd_for_each_device(mtd) old->remove(mtd); list_del(&old->list); mutex_unlock(&mtd_table_mutex); return 0; } EXPORT_SYMBOL_GPL(unregister_mtd_user); /** * get_mtd_device - obtain a validated handle for an MTD device * @mtd: last known address of the required MTD device * @num: internal device number of the required MTD device * * Given a number and NULL address, return the num'th entry in the device * table, if any. Given an address and num == -1, search the device table * for a device with that address and return if it's still present. Given * both, return the num'th driver only if its address matches. Return * error code if not. */ struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num) { struct mtd_info *ret = NULL, *other; int err = -ENODEV; mutex_lock(&mtd_table_mutex); if (num == -1) { mtd_for_each_device(other) { if (other == mtd) { ret = mtd; break; } } } else if (num >= 0) { ret = idr_find(&mtd_idr, num); if (mtd && mtd != ret) ret = NULL; } if (!ret) { ret = ERR_PTR(err); goto out; } err = __get_mtd_device(ret); if (err) ret = ERR_PTR(err); out: mutex_unlock(&mtd_table_mutex); return ret; } EXPORT_SYMBOL_GPL(get_mtd_device); int __get_mtd_device(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); int err; if (master->_get_device) { err = master->_get_device(mtd); if (err) return err; } if (!try_module_get(master->owner)) { if (master->_put_device) master->_put_device(master); return -ENODEV; } while (mtd) { if (mtd != master) kref_get(&mtd->refcnt); mtd = mtd->parent; } if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) kref_get(&master->refcnt); return 0; } EXPORT_SYMBOL_GPL(__get_mtd_device); /** * of_get_mtd_device_by_node - obtain an MTD device associated with a given node * * @np: device tree node */ struct mtd_info *of_get_mtd_device_by_node(struct device_node *np) { struct mtd_info *mtd = NULL; struct mtd_info *tmp; int err; mutex_lock(&mtd_table_mutex); err = -EPROBE_DEFER; mtd_for_each_device(tmp) { if (mtd_get_of_node(tmp) == np) { mtd = tmp; err = __get_mtd_device(mtd); break; } } mutex_unlock(&mtd_table_mutex); return err ? ERR_PTR(err) : mtd; } EXPORT_SYMBOL_GPL(of_get_mtd_device_by_node); /** * get_mtd_device_nm - obtain a validated handle for an MTD device by * device name * @name: MTD device name to open * * This function returns MTD device description structure in case of * success and an error code in case of failure. */ struct mtd_info *get_mtd_device_nm(const char *name) { int err = -ENODEV; struct mtd_info *mtd = NULL, *other; mutex_lock(&mtd_table_mutex); mtd_for_each_device(other) { if (!strcmp(name, other->name)) { mtd = other; break; } } if (!mtd) goto out_unlock; err = __get_mtd_device(mtd); if (err) goto out_unlock; mutex_unlock(&mtd_table_mutex); return mtd; out_unlock: mutex_unlock(&mtd_table_mutex); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(get_mtd_device_nm); void put_mtd_device(struct mtd_info *mtd) { mutex_lock(&mtd_table_mutex); __put_mtd_device(mtd); mutex_unlock(&mtd_table_mutex); } EXPORT_SYMBOL_GPL(put_mtd_device); void __put_mtd_device(struct mtd_info *mtd) { struct mtd_info *master = mtd_get_master(mtd); while (mtd) { /* kref_put() can relese mtd, so keep a reference mtd->parent */ struct mtd_info *parent = mtd->parent; if (mtd != master) kref_put(&mtd->refcnt, mtd_device_release); mtd = parent; } if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) kref_put(&master->refcnt, mtd_device_release); module_put(master->owner); /* must be the last as master can be freed in the _put_device */ if (master->_put_device) master->_put_device(master); } EXPORT_SYMBOL_GPL(__put_mtd_device); /* * Erase is an synchronous operation. Device drivers are epected to return a * negative error code if the operation failed and update instr->fail_addr * to point the portion that was not properly erased. */ int mtd_erase(struct mtd_info *mtd, struct erase_info *instr) { struct mtd_info *master = mtd_get_master(mtd); u64 mst_ofs = mtd_get_master_ofs(mtd, 0); struct erase_info adjinstr; int ret; instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN; adjinstr = *instr; if (!mtd->erasesize || !master->_erase) return -ENOTSUPP; if (instr->addr >= mtd->size || instr->len > mtd->size - instr->addr) return -EINVAL; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; if (!instr->len) return 0; ledtrig_mtd_activity(); if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { adjinstr.addr = (loff_t)mtd_div_by_eb(instr->addr, mtd) * master->erasesize; adjinstr.len = ((u64)mtd_div_by_eb(instr->addr + instr->len, mtd) * master->erasesize) - adjinstr.addr; } adjinstr.addr += mst_ofs; ret = master->_erase(master, &adjinstr); if (adjinstr.fail_addr != MTD_FAIL_ADDR_UNKNOWN) { instr->fail_addr = adjinstr.fail_addr - mst_ofs; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) { instr->fail_addr = mtd_div_by_eb(instr->fail_addr, master); instr->fail_addr *= mtd->erasesize; } } return ret; } EXPORT_SYMBOL_GPL(mtd_erase); ALLOW_ERROR_INJECTION(mtd_erase, ERRNO); /* * This stuff for eXecute-In-Place. phys is optional and may be set to NULL. */ int mtd_point(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, void **virt, resource_size_t *phys) { struct mtd_info *master = mtd_get_master(mtd); *retlen = 0; *virt = NULL; if (phys) *phys = 0; if (!master->_point) return -EOPNOTSUPP; if (from < 0 || from >= mtd->size || len > mtd->size - from) return -EINVAL; if (!len) return 0; from = mtd_get_master_ofs(mtd, from); return master->_point(master, from, len, retlen, virt, phys); } EXPORT_SYMBOL_GPL(mtd_point); /* We probably shouldn't allow XIP if the unpoint isn't a NULL */ int mtd_unpoint(struct mtd_info *mtd, loff_t from, size_t len) { struct mtd_info *master = mtd_get_master(mtd); if (!master->_unpoint) return -EOPNOTSUPP; if (from < 0 || from >= mtd->size || len > mtd->size - from) return -EINVAL; if (!len) return 0; return master->_unpoint(master, mtd_get_master_ofs(mtd, from), len); } EXPORT_SYMBOL_GPL(mtd_unpoint); /* * Allow NOMMU mmap() to directly map the device (if not NULL) * - return the address to which the offset maps * - return -ENOSYS to indicate refusal to do the mapping */ unsigned long mtd_get_unmapped_area(struct mtd_info *mtd, unsigned long len, unsigned long offset, unsigned long flags) { size_t retlen; void *virt; int ret; ret = mtd_point(mtd, offset, len, &retlen, &virt, NULL); if (ret) return ret; if (retlen != len) { mtd_unpoint(mtd, offset, retlen); return -ENOSYS; } return (unsigned long)virt; } EXPORT_SYMBOL_GPL(mtd_get_unmapped_area); static void mtd_update_ecc_stats(struct mtd_info *mtd, struct mtd_info *master, const struct mtd_ecc_stats *old_stats) { struct mtd_ecc_stats diff; if (master == mtd) return; diff = master->ecc_stats; diff.failed -= old_stats->failed; diff.corrected -= old_stats->corrected; while (mtd->parent) { mtd->ecc_stats.failed += diff.failed; mtd->ecc_stats.corrected += diff.corrected; mtd = mtd->parent; } } int mtd_read(struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf) { struct mtd_oob_ops ops = { .len = len, .datbuf = buf, }; int ret; ret = mtd_read_oob(mtd, from, &ops); *retlen = ops.retlen; WARN_ON_ONCE(*retlen != len && mtd_is_bitflip_or_eccerr(ret)); return ret; } EXPORT_SYMBOL_GPL(mtd_read); ALLOW_ERROR_INJECTION(mtd_read, ERRNO); int mtd_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct mtd_oob_ops ops = { .len = len, .datbuf = (u8 *)buf, }; int ret; ret = mtd_write_oob(mtd, to, &ops); *retlen = ops.retlen; return ret; } EXPORT_SYMBOL_GPL(mtd_write); ALLOW_ERROR_INJECTION(mtd_write, ERRNO); /* * In blackbox flight recorder like scenarios we want to make successful writes * in interrupt context. panic_write() is only intended to be called when its * known the kernel is about to panic and we need the write to succeed. Since * the kernel is not going to be running for much longer, this function can * break locks and delay to ensure the write succeeds (but not sleep). */ int mtd_panic_write(struct mtd_info *mtd, loff_t to, size_t len, size_t *retlen, const u_char *buf) { struct mtd_info *master = mtd_get_master(mtd); *retlen = 0; if (!master->_panic_write) return -EOPNOTSUPP; if (to < 0 || to >= mtd->size || len > mtd->size - to) return -EINVAL; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; if (!len) return 0; if (!master->oops_panic_write) master->oops_panic_write = true; return master->_panic_write(master, mtd_get_master_ofs(mtd, to), len, retlen, buf); } EXPORT_SYMBOL_GPL(mtd_panic_write); static int mtd_check_oob_ops(struct mtd_info *mtd, loff_t offs, struct mtd_oob_ops *ops) { /* * Some users are setting ->datbuf or ->oobbuf to NULL, but are leaving * ->len or ->ooblen uninitialized. Force ->len and ->ooblen to 0 in * this case. */ if (!ops->datbuf) ops->len = 0; if (!ops->oobbuf) ops->ooblen = 0; if (offs < 0 || offs + ops->len > mtd->size) return -EINVAL; if (ops->ooblen) { size_t maxooblen; if (ops->ooboffs >= mtd_oobavail(mtd, ops)) return -EINVAL; maxooblen = ((size_t)(mtd_div_by_ws(mtd->size, mtd) - mtd_div_by_ws(offs, mtd)) * mtd_oobavail(mtd, ops)) - ops->ooboffs; if (ops->ooblen > maxooblen) return -EINVAL; } return 0; } static int mtd_read_oob_std(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ret; from = mtd_get_master_ofs(mtd, from); if (master->_read_oob) ret = master->_read_oob(master, from, ops); else ret = master->_read(master, from, ops->len, &ops->retlen, ops->datbuf); return ret; } static int mtd_write_oob_std(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ret; to = mtd_get_master_ofs(mtd, to); if (master->_write_oob) ret = master->_write_oob(master, to, ops); else ret = master->_write(master, to, ops->len, &ops->retlen, ops->datbuf); return ret; } static int mtd_io_emulated_slc(struct mtd_info *mtd, loff_t start, bool read, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ngroups = mtd_pairing_groups(master); int npairs = mtd_wunit_per_eb(master) / ngroups; struct mtd_oob_ops adjops = *ops; unsigned int wunit, oobavail; struct mtd_pairing_info info; int max_bitflips = 0; u32 ebofs, pageofs; loff_t base, pos; ebofs = mtd_mod_by_eb(start, mtd); base = (loff_t)mtd_div_by_eb(start, mtd) * master->erasesize; info.group = 0; info.pair = mtd_div_by_ws(ebofs, mtd); pageofs = mtd_mod_by_ws(ebofs, mtd); oobavail = mtd_oobavail(mtd, ops); while (ops->retlen < ops->len || ops->oobretlen < ops->ooblen) { int ret; if (info.pair >= npairs) { info.pair = 0; base += master->erasesize; } wunit = mtd_pairing_info_to_wunit(master, &info); pos = mtd_wunit_to_offset(mtd, base, wunit); adjops.len = ops->len - ops->retlen; if (adjops.len > mtd->writesize - pageofs) adjops.len = mtd->writesize - pageofs; adjops.ooblen = ops->ooblen - ops->oobretlen; if (adjops.ooblen > oobavail - adjops.ooboffs) adjops.ooblen = oobavail - adjops.ooboffs; if (read) { ret = mtd_read_oob_std(mtd, pos + pageofs, &adjops); if (ret > 0) max_bitflips = max(max_bitflips, ret); } else { ret = mtd_write_oob_std(mtd, pos + pageofs, &adjops); } if (ret < 0) return ret; max_bitflips = max(max_bitflips, ret); ops->retlen += adjops.retlen; ops->oobretlen += adjops.oobretlen; adjops.datbuf += adjops.retlen; adjops.oobbuf += adjops.oobretlen; adjops.ooboffs = 0; pageofs = 0; info.pair++; } return max_bitflips; } int mtd_read_oob(struct mtd_info *mtd, loff_t from, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); struct mtd_ecc_stats old_stats = master->ecc_stats; int ret_code; ops->retlen = ops->oobretlen = 0; ret_code = mtd_check_oob_ops(mtd, from, ops); if (ret_code) return ret_code; ledtrig_mtd_activity(); /* Check the validity of a potential fallback on mtd->_read */ if (!master->_read_oob && (!master->_read || ops->oobbuf)) return -EOPNOTSUPP; if (ops->stats) memset(ops->stats, 0, sizeof(*ops->stats)); if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) ret_code = mtd_io_emulated_slc(mtd, from, true, ops); else ret_code = mtd_read_oob_std(mtd, from, ops); mtd_update_ecc_stats(mtd, master, &old_stats); /* * In cases where ops->datbuf != NULL, mtd->_read_oob() has semantics * similar to mtd->_read(), returning a non-negative integer * representing max bitflips. In other cases, mtd->_read_oob() may * return -EUCLEAN. In all cases, perform similar logic to mtd_read(). */ if (unlikely(ret_code < 0)) return ret_code; if (mtd->ecc_strength == 0) return 0; /* device lacks ecc */ if (ops->stats) ops->stats->max_bitflips = ret_code; return ret_code >= mtd->bitflip_threshold ? -EUCLEAN : 0; } EXPORT_SYMBOL_GPL(mtd_read_oob); int mtd_write_oob(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { struct mtd_info *master = mtd_get_master(mtd); int ret; ops->retlen = ops->oobretlen = 0; if (!(mtd->flags & MTD_WRITEABLE)) return -EROFS; ret = mtd_check_oob_ops(mtd, to, ops); if (ret) return ret; ledtrig_mtd_activity(); /* Check the validity of a potential fallback on mtd->_write */ if (!master->_write_oob && (!master->_write || ops->oobbuf)) return -EOPNOTSUPP; if (mtd->flags & MTD_SLC_ON_MLC_EMULATION) return mtd_io_emulated_slc(mtd, to, false, ops); return mtd_write_oob_std(mtd, to, ops); } EXPORT_SYMBOL_GPL(mtd_write_oob); /** * mtd_ooblayout_ecc - Get the OOB region definition of a specific ECC section * @mtd: MTD device structure * @section: ECC section. Depending on the layout you may have all the ECC * bytes stored in a single contiguous section, or one section * per ECC chunk (and sometime several sections for a single ECC * ECC chunk) * @oobecc: OOB region struct filled with the appropriate ECC position * information * * This function returns ECC section information in the OOB area. If you want * to get all the ECC bytes information, then you should call * mtd_ooblayout_ecc(mtd, section++, oobecc) until it returns -ERANGE. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_ecc(struct mtd_info *mtd, int section, struct mtd_oob_region *oobecc) { struct mtd_info *master = mtd_get_master(mtd); memset(oobecc, 0, sizeof(*oobecc)); if (!master || section < 0) return -EINVAL; if (!master->ooblayout || !master->ooblayout->ecc) return -ENOTSUPP; return master->ooblayout->ecc(master, section, oobecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_ecc); /** * mtd_ooblayout_free - Get the OOB region definition of a specific free * section * @mtd: MTD device structure * @section: Free section you are interested in. Depending on the layout * you may have all the free bytes stored in a single contiguous * section, or one section per ECC chunk plus an extra section * for the remaining bytes (or other funky layout). * @oobfree: OOB region struct filled with the appropriate free position * information * * This function returns free bytes position in the OOB area. If you want * to get all the free bytes information, then you should call * mtd_ooblayout_free(mtd, section++, oobfree) until it returns -ERANGE. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_free(struct mtd_info *mtd, int section, struct mtd_oob_region *oobfree) { struct mtd_info *master = mtd_get_master(mtd); memset(oobfree, 0, sizeof(*oobfree)); if (!master || section < 0) return -EINVAL; if (!master->ooblayout || !master->ooblayout->free) return -ENOTSUPP; return master->ooblayout->free(master, section, oobfree); } EXPORT_SYMBOL_GPL(mtd_ooblayout_free); /** * mtd_ooblayout_find_region - Find the region attached to a specific byte * @mtd: mtd info structure * @byte: the byte we are searching for * @sectionp: pointer where the section id will be stored * @oobregion: used to retrieve the ECC position * @iter: iterator function. Should be either mtd_ooblayout_free or * mtd_ooblayout_ecc depending on the region type you're searching for * * This function returns the section id and oobregion information of a * specific byte. For example, say you want to know where the 4th ECC byte is * stored, you'll use: * * mtd_ooblayout_find_region(mtd, 3, &section, &oobregion, mtd_ooblayout_ecc); * * Returns zero on success, a negative error code otherwise. */ static int mtd_ooblayout_find_region(struct mtd_info *mtd, int byte, int *sectionp, struct mtd_oob_region *oobregion, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { int pos = 0, ret, section = 0; memset(oobregion, 0, sizeof(*oobregion)); while (1) { ret = iter(mtd, section, oobregion); if (ret) return ret; if (pos + oobregion->length > byte) break; pos += oobregion->length; section++; } /* * Adjust region info to make it start at the beginning at the * 'start' ECC byte. */ oobregion->offset += byte - pos; oobregion->length -= byte - pos; *sectionp = section; return 0; } /** * mtd_ooblayout_find_eccregion - Find the ECC region attached to a specific * ECC byte * @mtd: mtd info structure * @eccbyte: the byte we are searching for * @section: pointer where the section id will be stored * @oobregion: OOB region information * * Works like mtd_ooblayout_find_region() except it searches for a specific ECC * byte. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_find_eccregion(struct mtd_info *mtd, int eccbyte, int *section, struct mtd_oob_region *oobregion) { return mtd_ooblayout_find_region(mtd, eccbyte, section, oobregion, mtd_ooblayout_ecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_find_eccregion); /** * mtd_ooblayout_get_bytes - Extract OOB bytes from the oob buffer * @mtd: mtd info structure * @buf: destination buffer to store OOB bytes * @oobbuf: OOB buffer * @start: first byte to retrieve * @nbytes: number of bytes to retrieve * @iter: section iterator * * Extract bytes attached to a specific category (ECC or free) * from the OOB buffer and copy them into buf. * * Returns zero on success, a negative error code otherwise. */ static int mtd_ooblayout_get_bytes(struct mtd_info *mtd, u8 *buf, const u8 *oobbuf, int start, int nbytes, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { struct mtd_oob_region oobregion; int section, ret; ret = mtd_ooblayout_find_region(mtd, start, &section, &oobregion, iter); while (!ret) { int cnt; cnt = min_t(int, nbytes, oobregion.length); memcpy(buf, oobbuf + oobregion.offset, cnt); buf += cnt; nbytes -= cnt; if (!nbytes) break; ret = iter(mtd, ++section, &oobregion); } return ret; } /** * mtd_ooblayout_set_bytes - put OOB bytes into the oob buffer * @mtd: mtd info structure * @buf: source buffer to get OOB bytes from * @oobbuf: OOB buffer * @start: first OOB byte to set * @nbytes: number of OOB bytes to set * @iter: section iterator * * Fill the OOB buffer with data provided in buf. The category (ECC or free) * is selected by passing the appropriate iterator. * * Returns zero on success, a negative error code otherwise. */ static int mtd_ooblayout_set_bytes(struct mtd_info *mtd, const u8 *buf, u8 *oobbuf, int start, int nbytes, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { struct mtd_oob_region oobregion; int section, ret; ret = mtd_ooblayout_find_region(mtd, start, &section, &oobregion, iter); while (!ret) { int cnt; cnt = min_t(int, nbytes, oobregion.length); memcpy(oobbuf + oobregion.offset, buf, cnt); buf += cnt; nbytes -= cnt; if (!nbytes) break; ret = iter(mtd, ++section, &oobregion); } return ret; } /** * mtd_ooblayout_count_bytes - count the number of bytes in a OOB category * @mtd: mtd info structure * @iter: category iterator * * Count the number of bytes in a given category. * * Returns a positive value on success, a negative error code otherwise. */ static int mtd_ooblayout_count_bytes(struct mtd_info *mtd, int (*iter)(struct mtd_info *, int section, struct mtd_oob_region *oobregion)) { struct mtd_oob_region oobregion; int section = 0, ret, nbytes = 0; while (1) { ret = iter(mtd, section++, &oobregion); if (ret) { if (ret == -ERANGE) ret = nbytes; break; } nbytes += oobregion.length; } return ret; } /** * mtd_ooblayout_get_eccbytes - extract ECC bytes from the oob buffer * @mtd: mtd info structure * @eccbuf: destination buffer to store ECC bytes * @oobbuf: OOB buffer * @start: first ECC byte to retrieve * @nbytes: number of ECC bytes to retrieve * * Works like mtd_ooblayout_get_bytes(), except it acts on ECC bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_get_eccbytes(struct mtd_info *mtd, u8 *eccbuf, const u8 *oobbuf, int start, int nbytes) { return mtd_ooblayout_get_bytes(mtd, eccbuf, oobbuf, start, nbytes, mtd_ooblayout_ecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_get_eccbytes); /** * mtd_ooblayout_set_eccbytes - set ECC bytes into the oob buffer * @mtd: mtd info structure * @eccbuf: source buffer to get ECC bytes from * @oobbuf: OOB buffer * @start: first ECC byte to set * @nbytes: number of ECC bytes to set * * Works like mtd_ooblayout_set_bytes(), except it acts on ECC bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_set_eccbytes(struct mtd_info *mtd, const u8 *eccbuf, u8 *oobbuf, int start, int nbytes) { return mtd_ooblayout_set_bytes(mtd, eccbuf, oobbuf, start, nbytes, mtd_ooblayout_ecc); } EXPORT_SYMBOL_GPL(mtd_ooblayout_set_eccbytes); /** * mtd_ooblayout_get_databytes - extract data bytes from the oob buffer * @mtd: mtd info structure * @databuf: destination buffer to store ECC bytes * @oobbuf: OOB buffer * @start: first ECC byte to retrieve * @nbytes: number of ECC bytes to retrieve * * Works like mtd_ooblayout_get_bytes(), except it acts on free bytes. * * Returns zero on success, a negative error code otherwise. */ int mtd_ooblayout_get_databytes(struct mtd_info *mtd, u8 *databuf, const u8 *oobbu