6 6 6 6 6 6 6 2 2 6 6 6 6 5 5 1 1 5 6 6 6 6 6 6 78 78 78 75 1 1 1 13 7 3 3 3 3 13 78 73 1 6 5 74 6 75 74 74 9 75 75 75 75 75 75 75 75 75 75 75 4 75 75 75 4 75 6 6 1 75 3 3 75 3 3 75 7 7 7 7 7 75 75 75 9 75 7 7 7 2 7 6 7 1 6 3 4 1 4 75 75 8 6 2 8 8 8 5 4 5 5 2 6 6 3 6 8 9 2 9 7 7 6 6 6 6 16 16 16 16 16 16 16 16 16 16 15 15 15 14 15 15 15 15 15 15 15 15 15 15 15 3 14 9 9 9 6 6 9 9 9 9 9 9 9 26 75 8 3 8 5 5 5 5 5 5 9 3 3 6 6 6 3 3 3 3 3 3 3 3 75 75 75 3 75 75 75 75 75 75 74 74 75 8 75 75 22 3 3 3 3 75 74 22 75 75 3 75 74 74 75 75 75 75 75 75 75 75 1 75 75 75 75 75 75 75 74 75 6 74 75 75 74 75 75 75 75 75 75 75 75 75 75 75 75 72 75 74 75 75 75 74 75 75 74 6 6 6 8 8 8 8 2 6 6 5 5 5 5 6 8 8 8 8 8 3 3 3 8 3 3 3 3 3 3 8 5 7 3 3 8 8 8 6 6 6 6 6 6 6 6 3 3 3 6 6 3 6 6 3 146 142 146 147 147 16 16 16 16 147 1 147 11 1 11 1 1 75 75 75 75 75 75 75 75 75 4 75 75 75 75 75 75 75 75 75 75 74 75 12 75 75 75 45 74 74 6 6 75 75 75 75 75 75 75 75 75 75 75 75 74 3 75 75 74 75 75 75 74 75 6 1 6 6 6 73 75 75 75 74 75 75 75 75 24 24 22 22 22 22 22 3 3 24 129 124 125 124 128 129 129 34 24 32 14 21 21 21 21 129 13 13 1 13 13 13 13 18 28 1 114 33 147 1 1 1 1 1 1 1 1 1 1 147 147 147 147 147 146 1 147 147 147 114 113 114 147 145 146 118 142 147 1 2 2 2 2 147 147 1 1 1 146 147 1 1 1 1 147 147 1 147 147 146 147 146 147 147 37 37 37 36 36 3 35 18 36 1 147 146 2 146 147 147 5 5 3 147 145 147 147 118 146 147 147 147 146 147 147 147 147 114 2 2 114 147 147 147 147 5 147 145 147 142 41 147 3 3 3 75 75 75 75 5 5 5 5 5 5 5 5 5 3 75 75 75 75 75 74 75 75 3 75 75 20 20 20 142 3 141 38 36 13 125 142 142 142 142 3 142 142 124 3 3 3 124 125 141 4 4 4 4 4 4 4 1 4 1 1 1 4 4 2 4 4 4 2 4 4 4 4 4 4 1 4 4 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/sched/signal.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/sort.h> #include <linux/rcupdate.h> #include <linux/kthread.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> #include <linux/lockdep.h> #include <linux/crc32c.h> #include "ctree.h" #include "extent-tree.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" #include "volumes.h" #include "raid56.h" #include "locking.h" #include "free-space-cache.h" #include "free-space-tree.h" #include "qgroup.h" #include "ref-verify.h" #include "space-info.h" #include "block-rsv.h" #include "discard.h" #include "zoned.h" #include "dev-replace.h" #include "fs.h" #include "accessors.h" #include "root-tree.h" #include "file-item.h" #include "orphan.h" #include "tree-checker.h" #include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static int block_group_bits(struct btrfs_block_group *cache, u64 bits) { return (cache->flags & bits) == bits; } /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = len; return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* * helper function to lookup reference count and flags of a tree block. * * the head node for delayed ref is used to store the sum of all the * reference count modifications queued up in the rbtree. the head * node may also store the extent flags to set. This way you can check * to see what the reference count and extent flags would be if all of * the delayed refs are not processed. */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags, u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; u64 owner = 0; int ret; /* * If we don't have skinny metadata, don't bother doing anything * different */ if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) { offset = fs_info->nodesize; metadata = 0; } path = btrfs_alloc_path(); if (!path) return -ENOMEM; search_again: key.objectid = bytenr; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == fs_info->nodesize) ret = 0; } } if (ret == 0) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; const u32 item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); if (unlikely(num_refs == 0)) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); } else { num_refs = 0; extent_flags = 0; ret = 0; } delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); /* * Mutex was contended, block until it's released and try * again */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); goto search_again; } spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); WARN_ON(num_refs == 0); if (refs) *refs = num_refs; if (flags) *flags = extent_flags; if (owning_root) *owning_root = owner; return ret; } /* * Back reference rules. Back refs have three main goals: * * 1) differentiate between all holders of references to an extent so that * when a reference is dropped we can make sure it was a valid reference * before freeing the extent. * * 2) Provide enough information to quickly find the holders of an extent * if we notice a given block is corrupted or bad. * * 3) Make it easy to migrate blocks for FS shrinking or storage pool * maintenance. This is actually the same as #2, but with a slightly * different use case. * * There are two kinds of back refs. The implicit back refs is optimized * for pointers in non-shared tree blocks. For a given pointer in a block, * back refs of this kind provide information about the block's owner tree * and the pointer's key. These information allow us to find the block by * b-tree searching. The full back refs is for pointers in tree blocks not * referenced by their owner trees. The location of tree block is recorded * in the back refs. Actually the full back refs is generic, and can be * used in all cases the implicit back refs is used. The major shortcoming * of the full back refs is its overhead. Every time a tree block gets * COWed, we have to update back refs entry for all pointers in it. * * For a newly allocated tree block, we use implicit back refs for * pointers in it. This means most tree related operations only involve * implicit back refs. For a tree block created in old transaction, the * only way to drop a reference to it is COW it. So we can detect the * event that tree block loses its owner tree's reference and do the * back refs conversion. * * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. * * The reference count of the block is one and the tree is not the * block's owner tree. In this case, full back refs is used for pointers * in the block. Remove these full back refs, add implicit back refs for * every pointers in the new block. * * The reference count of the block is greater than one and the tree is * the block's owner tree. In this case, implicit back refs is used for * pointers in the block. Add full back refs for every pointers in the * block, increase lower level extents' reference counts. The original * implicit back refs are entailed to the new block. * * The reference count of the block is greater than one and the tree is * not the block's owner tree. Add implicit back refs for every pointer in * the new block, increase lower level extents' reference count. * * Back Reference Key composing: * * The key objectid corresponds to the first byte in the extent, * The key type is used to differentiate between types of back refs. * There are different meanings of the key offset for different types * of back refs. * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * * The extent ref structure for the implicit back refs has fields for: * * - Objectid of the subvolume root * - objectid of the file holding the reference * - original offset in the file * - how many bookend extents * * The key offset for the implicit back refs is hash of the first * three fields. * * The extent ref structure for the full back refs has field for: * * - number of pointers in the tree leaf * * The key offset for the implicit back refs is the first byte of * the tree leaf * * When a file extent is allocated, The implicit back refs is used. * the fields are filled in: * * (root_key.objectid, inode objectid, offset in file, 1) * * When a file extent is removed file truncation, we find the * corresponding implicit back refs and check the following fields: * * (btrfs_header_owner(leaf), inode objectid, offset in file) * * Btree extents can be referenced by: * * - Different subvolumes * * Both the implicit back refs and the full back refs for tree blocks * only consist of key. The key offset for the implicit back refs is * objectid of block's owner tree. The key offset for the full back refs * is the first byte of parent block. * * When implicit back refs is used, information about the lowest key and * level of the tree block are required. These information are stored in * tree block info structure. */ /* * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, * is_data == BTRFS_REF_TYPE_DATA, data type is requiried, * is_data == BTRFS_REF_TYPE_ANY, either type is OK. */ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); if (type == BTRFS_EXTENT_OWNER_REF_KEY) { ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); return type; } if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || type == BTRFS_EXTENT_DATA_REF_KEY) { if (is_data == BTRFS_REF_TYPE_BLOCK) { if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { ASSERT(is_data == BTRFS_REF_TYPE_ANY); return type; } } WARN_ON(1); btrfs_print_leaf(eb); btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); return BTRFS_REF_TYPE_INVALID; } u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) { u32 high_crc = ~(u32)0; u32 low_crc = ~(u32)0; __le64 lenum; lenum = cpu_to_le64(root_objectid); high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, struct btrfs_extent_data_ref *ref) { return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), btrfs_extent_data_ref_objectid(leaf, ref), btrfs_extent_data_ref_offset(leaf, ref)); } static int match_extent_data_ref(struct extent_buffer *leaf, struct btrfs_extent_data_ref *ref, u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) return 0; return 1; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid, u64 owner, u64 offset) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; u32 nritems; int recow; int ret; key.objectid = bytenr; if (parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; key.offset = parent; } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; key.offset = hash_extent_data_ref(root_objectid, owner, offset); } again: recow = 0; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) return ret; if (parent) { if (ret) return -ENOENT; return 0; } ret = -ENOENT; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); while (1) { if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); if (ret) { if (ret > 0) return -ENOENT; return ret; } leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); recow = 1; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_DATA_REF_KEY) goto fail; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (match_extent_data_ref(leaf, ref, root_objectid, owner, offset)) { if (recow) { btrfs_release_path(path); goto again; } ret = 0; break; } path->slots[0]++; } fail: return ret; } static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; u64 owner = btrfs_delayed_ref_owner(node); u64 offset = btrfs_delayed_ref_offset(node); u32 size; u32 num_refs; int ret; key.objectid = bytenr; if (node->parent) { key.type = BTRFS_SHARED_DATA_REF_KEY; key.offset = node->parent; size = sizeof(struct btrfs_shared_data_ref); } else { key.type = BTRFS_EXTENT_DATA_REF_KEY; key.offset = hash_extent_data_ref(node->ref_root, owner, offset); size = sizeof(struct btrfs_extent_data_ref); } ret = btrfs_insert_empty_item(trans, root, path, &key, size); if (ret && ret != -EEXIST) goto fail; leaf = path->nodes[0]; if (node->parent) { struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); if (ret == 0) { btrfs_set_shared_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_shared_data_ref_count(leaf, ref); num_refs += node->ref_mod; btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } } else { struct btrfs_extent_data_ref *ref; while (ret == -EEXIST) { ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (match_extent_data_ref(leaf, ref, node->ref_root, owner, offset)) break; btrfs_release_path(path); key.offset++; ret = btrfs_insert_empty_item(trans, root, path, &key, size); if (ret && ret != -EEXIST) goto fail; leaf = path->nodes[0]; } ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); if (ret == 0) { btrfs_set_extent_data_ref_root(leaf, ref, node->ref_root); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); btrfs_set_extent_data_ref_count(leaf, ref, node->ref_mod); } else { num_refs = btrfs_extent_data_ref_count(leaf, ref); num_refs += node->ref_mod; btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } ret = 0; fail: btrfs_release_path(path); return ret; } static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; struct btrfs_shared_data_ref *ref2 = NULL; struct extent_buffer *leaf; u32 num_refs = 0; int ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } else { btrfs_err(trans->fs_info, "unrecognized backref key (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, -EUCLEAN); return -EUCLEAN; } BUG_ON(num_refs < refs_to_drop); num_refs -= refs_to_drop; if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); } return ret; } static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref) { struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref1; struct btrfs_shared_data_ref *ref2; u32 num_refs = 0; int type; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (iref) { /* * If type is invalid, we should have bailed out earlier than * this call. */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); ASSERT(type != BTRFS_REF_TYPE_INVALID); if (type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else { ref2 = (struct btrfs_shared_data_ref *)(iref + 1); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_data_ref); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); } else { WARN_ON(1); } return num_refs; } static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 parent, u64 root_objectid) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; if (parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; key.offset = parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; key.offset = root_objectid; } ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) ret = -ENOENT; return ret; } static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_ref_node *node, u64 bytenr) { struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr); struct btrfs_key key; int ret; key.objectid = bytenr; if (node->parent) { key.type = BTRFS_SHARED_BLOCK_REF_KEY; key.offset = node->parent; } else { key.type = BTRFS_TREE_BLOCK_REF_KEY; key.offset = node->ref_root; } ret = btrfs_insert_empty_item(trans, root, path, &key, 0); btrfs_release_path(path); return ret; } static inline int extent_ref_type(u64 parent, u64 owner) { int type; if (owner < BTRFS_FIRST_FREE_OBJECTID) { if (parent > 0) type = BTRFS_SHARED_BLOCK_REF_KEY; else type = BTRFS_TREE_BLOCK_REF_KEY; } else { if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; } return type; } static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) { for (; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) break; if (path->slots[level] + 1 >= btrfs_header_nritems(path->nodes[level])) continue; if (level == 0) btrfs_item_key_to_cpu(path->nodes[level], key, path->slots[level] + 1); else btrfs_node_key_to_cpu(path->nodes[level], key, path->slots[level] + 1); return 0; } return 1; } /* * look for inline back ref. if back ref is found, *ref_ret is set * to the address of inline back ref, and 0 is returned. * * if back ref isn't found, *ref_ret is set to the address where it * should be inserted, and -ENOENT is returned. * * if insert is true and there are too many inline back refs, the path * points to the extent item, and -EAGAIN is returned. * * NOTE: inline back refs are ordered in the same way that back ref * items in the tree are ordered. */ static noinline_for_stack int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int insert) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; u64 flags; u64 item_size; unsigned long ptr; unsigned long end; int extra_size; int type; int want; int ret; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); int needed; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); path->search_for_extension = 1; } else extra_size = -1; /* * Owner is our level, so we can just add one to get the level for the * block we are interested in. */ if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = owner; } again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); if (ret < 0) goto out; /* * We may be a newly converted file system which still has the old fat * extent entries for metadata, so try and see if we have one of those. */ if (ret > 0 && skinny_metadata) { skinny_metadata = false; if (path->slots[0]) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes) ret = 0; } if (ret) { key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); goto again; } } if (ret && !insert) { ret = -ENOENT; goto out; } else if (WARN_ON(ret)) { btrfs_print_leaf(path->nodes[0]); btrfs_err(fs_info, "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", bytenr, num_bytes, parent, root_objectid, owner, offset); ret = -EUCLEAN; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %llu expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + item_size; if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) { ptr += sizeof(struct btrfs_tree_block_info); BUG_ON(ptr > end); } if (owner >= BTRFS_FIRST_FREE_OBJECTID) needed = BTRFS_REF_TYPE_DATA; else needed = BTRFS_REF_TYPE_BLOCK; ret = -ENOENT; while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); if (type == BTRFS_EXTENT_OWNER_REF_KEY) { ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); ptr += btrfs_extent_inline_ref_size(type); continue; } if (type == BTRFS_REF_TYPE_INVALID) { ret = -EUCLEAN; goto out; } if (want < type) break; if (want > type) { ptr += btrfs_extent_inline_ref_size(type); continue; } if (type == BTRFS_EXTENT_DATA_REF_KEY) { struct btrfs_extent_data_ref *dref; dref = (struct btrfs_extent_data_ref *)(&iref->offset); if (match_extent_data_ref(leaf, dref, root_objectid, owner, offset)) { ret = 0; break; } if (hash_extent_data_ref_item(leaf, dref) < hash_extent_data_ref(root_objectid, owner, offset)) break; } else { u64 ref_offset; ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); if (parent > 0) { if (parent == ref_offset) { ret = 0; break; } if (ref_offset < parent) break; } else { if (root_objectid == ref_offset) { ret = 0; break; } if (ref_offset < root_objectid) break; } } ptr += btrfs_extent_inline_ref_size(type); } if (unlikely(ptr > end)) { ret = -EUCLEAN; btrfs_print_leaf(path->nodes[0]); btrfs_crit(fs_info, "overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", path->slots[0], root_objectid, owner, offset, parent); goto out; } if (ret == -ENOENT && insert) { if (item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { ret = -EAGAIN; goto out; } if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) { struct btrfs_key tmp_key; btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1); if (tmp_key.objectid == bytenr && tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { ret = -EAGAIN; goto out; } goto out_no_entry; } if (!path->keep_locks) { btrfs_release_path(path); path->keep_locks = 1; goto again; } /* * To add new inline back ref, we have to make sure * there is no corresponding back ref item. * For simplicity, we just do not add new inline back * ref if there is any kind of item for this block */ if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { ret = -EAGAIN; goto out; } } out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: if (path->keep_locks) { path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); } if (insert) path->search_for_extension = 0; return ret; } /* * helper to add new inline back ref */ static noinline_for_stack void setup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf; struct btrfs_extent_item *ei; unsigned long ptr; unsigned long end; unsigned long item_offset; u64 refs; int size; int type; leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); item_offset = (unsigned long)iref - (unsigned long)ei; type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); btrfs_extend_item(trans, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); refs += refs_to_add; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); ptr = (unsigned long)ei + item_offset; end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]); if (ptr < end - size) memmove_extent_buffer(leaf, ptr + size, ptr, end - size - ptr); iref = (struct btrfs_extent_inline_ref *)ptr; btrfs_set_extent_inline_ref_type(leaf, iref, type); if (type == BTRFS_EXTENT_DATA_REF_KEY) { struct btrfs_extent_data_ref *dref; dref = (struct btrfs_extent_data_ref *)(&iref->offset); btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); btrfs_set_extent_data_ref_objectid(leaf, dref, owner); btrfs_set_extent_data_ref_offset(leaf, dref, offset); btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); } else if (type == BTRFS_SHARED_DATA_REF_KEY) { struct btrfs_shared_data_ref *sref; sref = (struct btrfs_shared_data_ref *)(iref + 1); btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); btrfs_set_extent_inline_ref_offset(leaf, iref, parent); } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_offset(leaf, iref, parent); } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } } static int lookup_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref **ref_ret, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { int ret; ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr, num_bytes, parent, root_objectid, owner, offset, 0); if (ret != -ENOENT) return ret; btrfs_release_path(path); *ref_ret = NULL; if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = lookup_tree_block_ref(trans, path, bytenr, parent, root_objectid); } else { ret = lookup_extent_data_ref(trans, path, bytenr, parent, root_objectid, owner, offset); } return ret; } /* * helper to update/remove inline back ref */ static noinline_for_stack int update_inline_extent_backref( struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; unsigned long ptr; unsigned long end; u32 item_size; int size; int type; u64 refs; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { struct btrfs_key key; u32 extent_size; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type == BTRFS_METADATA_ITEM_KEY) extent_size = fs_info->nodesize; else extent_size = key.offset; btrfs_print_leaf(leaf); btrfs_err(fs_info, "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", key.objectid, extent_size, refs_to_mod, refs); return -EUCLEAN; } refs += refs_to_mod; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* * Function btrfs_get_extent_inline_ref_type() has already printed * error messages. */ if (unlikely(type == BTRFS_REF_TYPE_INVALID)) return -EUCLEAN; if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); refs = btrfs_extent_data_ref_count(leaf, dref); } else if (type == BTRFS_SHARED_DATA_REF_KEY) { sref = (struct btrfs_shared_data_ref *)(iref + 1); refs = btrfs_shared_data_ref_count(leaf, sref); } else { refs = 1; /* * For tree blocks we can only drop one ref for it, and tree * blocks should not have refs > 1. * * Furthermore if we're inserting a new inline backref, we * won't reach this path either. That would be * setup_inline_extent_backref(). */ if (unlikely(refs_to_mod != -1)) { struct btrfs_key key; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_print_leaf(leaf); btrfs_err(fs_info, "invalid refs_to_mod for tree block %llu, has %d expect -1", key.objectid, refs_to_mod); return -EUCLEAN; } } if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { struct btrfs_key key; u32 extent_size; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type == BTRFS_METADATA_ITEM_KEY) extent_size = fs_info->nodesize; else extent_size = key.offset; btrfs_print_leaf(leaf); btrfs_err(fs_info, "invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", (unsigned long)iref, key.objectid, extent_size, refs_to_mod, refs); return -EUCLEAN; } refs += refs_to_mod; if (refs > 0) { if (type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, dref, refs); else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; end = (unsigned long)ei + item_size; if (ptr + size < end) memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; btrfs_truncate_item(trans, path, item_size, 1); } return 0; } static noinline_for_stack int insert_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_extent_inline_ref *iref; int ret; ret = lookup_inline_extent_backref(trans, path, &iref, bytenr, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { /* * We're adding refs to a tree block we already own, this * should not happen at all. */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, "adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } ret = update_inline_extent_backref(trans, path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; } return ret; } static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); if (iref) ret = update_inline_extent_backref(trans, path, iref, -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else ret = btrfs_del_item(trans, root, path); return ret; } static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, u64 *discarded_bytes) { int j, ret = 0; u64 bytes_left, end; u64 aligned_start = ALIGN(start, SECTOR_SIZE); /* Adjust the range to be aligned to 512B sectors if necessary. */ if (start != aligned_start) { len -= aligned_start - start; len = round_down(len, SECTOR_SIZE); start = aligned_start; } *discarded_bytes = 0; if (!len) return 0; end = start + len; bytes_left = len; /* Skip any superblocks on this device. */ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) { u64 sb_start = btrfs_sb_offset(j); u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE; u64 size = sb_start - start; if (!in_range(sb_start, start, bytes_left) && !in_range(sb_end, start, bytes_left) && !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE)) continue; /* * Superblock spans beginning of range. Adjust start and * try again. */ if (sb_start <= start) { start += sb_end - start; if (start > end) { bytes_left = 0; break; } bytes_left = end - start; continue; } if (size) { ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, size >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) return ret; } start = sb_end; if (start > end) { bytes_left = 0; break; } bytes_left = end - start; } while (bytes_left) { u64 bytes_to_discard = min(BTRFS_MAX_DISCARD_CHUNK_SIZE, bytes_left); ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, bytes_to_discard >> SECTOR_SHIFT, GFP_NOFS); if (ret) { if (ret != -EOPNOTSUPP) break; continue; } start += bytes_to_discard; bytes_left -= bytes_to_discard; *discarded_bytes += bytes_to_discard; if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } } return ret; } static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; u64 phys = stripe->physical; u64 len = stripe->length; u64 discarded = 0; int ret = 0; /* Zone reset on a zoned filesystem */ if (btrfs_can_zone_reset(dev, phys, len)) { u64 src_disc; ret = btrfs_reset_device_zone(dev, phys, len, &discarded); if (ret) goto out; if (!btrfs_dev_replace_is_ongoing(dev_replace) || dev != dev_replace->srcdev) goto out; src_disc = discarded; /* Send to replace target as well */ ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; *bytes = 0; } out: *bytes = discarded; return ret; } int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { int ret = 0; u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; /* * Avoid races with device replace and make sure the devices in the * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { struct btrfs_discard_stripe *stripes; unsigned int num_stripes; int i; num_bytes = end - cur; stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); if (IS_ERR(stripes)) { ret = PTR_ERR(stripes); if (ret == -EOPNOTSUPP) ret = 0; break; } for (i = 0; i < num_stripes; i++) { struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &stripe->dev->dev_state)) continue; ret = do_discard_extent(stripe, &bytes); if (ret) { /* * Keep going if discard is not supported by the * device. */ if (ret != -EOPNOTSUPP) break; ret = 0; } else { discarded_bytes += bytes; } } kfree(stripes); if (ret) break; cur += num_bytes; } btrfs_bio_counter_dec(fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; return ret; } /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && generic_ref->ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); else ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); btrfs_ref_tree_mod(fs_info, generic_ref); return ret; } /* * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for * extent whose references are incremented. * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * */ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; u64 owner = btrfs_delayed_ref_owner(node); u64 offset = btrfs_delayed_ref_offset(node); u64 refs; int refs_to_add = node->ref_mod; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and * inline extent ref, so just update the reference count and add a * normal backref. */ leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); btrfs_release_path(path); /* now insert the actual backref */ if (owner < BTRFS_FIRST_FREE_OBJECTID) ret = insert_tree_block_ref(trans, path, node, bytenr); else ret = insert_extent_data_ref(trans, path, node, bytenr); if (ret) btrfs_abort_transaction(trans, ret); return ret; } static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *href) { u64 root = href->owning_root; /* * Don't check must_insert_reserved, as this is called from contexts * where it has already been unset. */ if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || !href->is_data || !is_fstree(root)) return; btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, BTRFS_QGROUP_RSV_DATA); } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; u64 parent = 0; u64 flags = 0; trace_run_delayed_data_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = node->parent; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { struct btrfs_key key; struct btrfs_squota_delta delta = { .root = href->owning_root, .num_bytes = node->num_bytes, .is_data = true, .is_inc = true, .generation = trans->transid, }; u64 owner = btrfs_delayed_ref_owner(node); u64 offset = btrfs_delayed_ref_offset(node); if (extent_op) flags |= extent_op->flags_to_set; key.objectid = node->bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = node->num_bytes; ret = alloc_reserved_file_extent(trans, parent, node->ref_root, flags, owner, offset, &key, node->ref_mod, href->owning_root); free_head_ref_squota_rsv(trans->fs_info, href); if (!ret) ret = btrfs_record_squota_delta(trans->fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } return ret; } static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei) { u64 flags = btrfs_extent_flags(leaf, ei); if (extent_op->update_flags) { flags |= extent_op->flags_to_set; btrfs_set_extent_flags(leaf, ei, flags); } if (extent_op->update_key) { struct btrfs_tree_block_info *bi; BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); bi = (struct btrfs_tree_block_info *)(ei + 1); btrfs_set_tree_block_key(leaf, bi, &extent_op->key); } } static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; int ret; int metadata = 1; if (TRANS_ABORTED(trans)) return 0; if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = head->bytenr; if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = head->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = head->num_bytes; } root = btrfs_extent_root(fs_info, key.objectid); again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == head->bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == head->num_bytes) ret = 0; } if (ret > 0) { btrfs_release_path(path); metadata = 0; key.objectid = head->bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = head->num_bytes; goto again; } } else { ret = -EUCLEAN; btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); return ret; } } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; struct btrfs_fs_info *fs_info = trans->fs_info; u64 parent = 0; u64 ref_root = 0; trace_run_delayed_tree_ref(trans->fs_info, node); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = node->parent; ref_root = node->ref_root; if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { struct btrfs_squota_delta delta = { .root = href->owning_root, .num_bytes = fs_info->nodesize, .is_data = false, .is_inc = true, .generation = trans->transid, }; ret = alloc_reserved_tree_block(trans, node, extent_op); if (!ret) btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { ret = __btrfs_free_extent(trans, href, node, extent_op); } else { BUG(); } return ret; } /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; if (TRANS_ABORTED(trans)) { if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); free_head_ref_squota_rsv(trans->fs_info, href); } return 0; } if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) ret = 0; else BUG(); if (ret && insert_reserved) btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); if (ret < 0) btrfs_err(trans->fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", node->bytenr, node->num_bytes, node->type, node->action, node->ref_mod, ret); return ret; } static struct btrfs_delayed_extent_op *cleanup_extent_op( struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; if (!extent_op) return NULL; if (head->must_insert_reserved) { head->extent_op = NULL; btrfs_free_delayed_extent_op(extent_op); return NULL; } return extent_op; } static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_extent_op *extent_op; int ret; extent_op = cleanup_extent_op(head); if (!extent_op) return 0; head->extent_op = NULL; spin_unlock(&head->lock); ret = run_delayed_extent_op(trans, head, extent_op); btrfs_free_delayed_extent_op(extent_op); return ret ? ret : 1; } u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { u64 ret = 0; /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. */ if (head->total_ref_mod < 0 && head->is_data) { int nr_csums; spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } /* must_insert_reserved can be set only if we didn't run the head ref. */ if (head->must_insert_reserved) free_head_ref_squota_rsv(fs_info, head); return ret; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; int ret; delayed_refs = &trans->transaction->delayed_refs; ret = run_and_cleanup_extent_op(trans, head); if (ret < 0) { btrfs_unselect_ref_head(delayed_refs, head); btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); return ret; } else if (ret) { return ret; } /* * Need to drop our head ref lock and re-acquire the delayed ref lock * and then re-check to make sure nobody got added. */ spin_unlock(&head->lock); spin_lock(&delayed_refs->lock); spin_lock(&head->lock); if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) { spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 1; } btrfs_delete_ref_head(fs_info, delayed_refs, head); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); if (head->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(fs_info, head->bytenr); ret = btrfs_del_csums(trans, csum_root, head->bytenr, head->num_bytes); } } *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); return ret; } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref, u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_extent_op *extent_op; struct btrfs_delayed_ref_node *ref; bool must_insert_reserved; int ret; delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&locked_ref->mutex); lockdep_assert_held(&locked_ref->lock); while ((ref = btrfs_select_delayed_ref(locked_ref))) { if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq)) { spin_unlock(&locked_ref->lock); btrfs_unselect_ref_head(delayed_refs, locked_ref); return -EAGAIN; } rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); /* * When we play the delayed ref, also correct the ref_mod on * head */ switch (ref->action) { case BTRFS_ADD_DELAYED_REF: case BTRFS_ADD_DELAYED_EXTENT: locked_ref->ref_mod -= ref->ref_mod; break; case BTRFS_DROP_DELAYED_REF: locked_ref->ref_mod += ref->ref_mod; break; default: WARN_ON(1); } /* * Record the must_insert_reserved flag before we drop the * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; /* * Unsetting this on the head ref relinquishes ownership of * the rsv_bytes, so it is critical that every possible code * path from here forward frees all reserves including qgroup * reserve. */ locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, must_insert_reserved); btrfs_delayed_refs_rsv_release(fs_info, 1, 0); *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); btrfs_free_delayed_extent_op(extent_op); if (ret) { btrfs_unselect_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); return ret; } btrfs_put_delayed_ref(ref); cond_resched(); spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); } return 0; } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; unsigned long count = 0; unsigned long max_count = 0; u64 bytes_processed = 0; delayed_refs = &trans->transaction->delayed_refs; if (min_bytes == 0) { max_count = delayed_refs->num_heads_ready; min_bytes = U64_MAX; } do { if (!locked_ref) { locked_ref = btrfs_select_ref_head(fs_info, delayed_refs); if (IS_ERR_OR_NULL(locked_ref)) { if (PTR_ERR(locked_ref) == -EAGAIN) { continue; } else { break; } } count++; } /* * We need to try and merge add/drops of the same ref since we * can run into issues with relocate dropping the implicit ref * and then it being added back again before the drop can * finish. If we merged anything we need to re-loop so we can * get a good ref. * Or we can get node references of the same type that weren't * merged when created due to bumps in the tree mod seq, and * we need to merge them to prevent adding an inline extent * backref before dropping it (triggering a BUG_ON at * insert_inline_extent_backref()). */ spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already * unlocked everything so just bail out */ return ret; } else if (!ret) { /* * Success, perform the usual cleanup of a processed * head */ ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; continue; } else if (ret) { return ret; } } /* * Either success case or btrfs_run_delayed_refs_for_head * returned -EAGAIN, meaning we need to select another head */ locked_ref = NULL; cond_resched(); } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || (max_count > 0 && count < max_count) || locked_ref); return 0; } #ifdef SCRAMBLE_DELAYED_REFS /* * Normally delayed refs get processed in ascending bytenr order. This * correlates in most cases to the order added. To expose dependencies on this * order, we start to process the tree in the middle instead of the beginning */ static u64 find_middle(struct rb_root *root) { struct rb_node *n = root->rb_node; struct btrfs_delayed_ref_node *entry; int alt = 1; u64 middle; u64 first = 0, last = 0; n = rb_first(root); if (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); first = entry->bytenr; } n = rb_last(root); if (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); last = entry->bytenr; } n = root->rb_node; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); WARN_ON(!entry->in_tree); middle = entry->bytenr; if (alt) n = n->rb_left; else n = n->rb_right; alt = 1 - alt; } return middle; } #endif /* * Start processing the delayed reference count updates and extent insertions * we have queued up so far. * * @trans: Transaction handle. * @min_bytes: How many bytes of delayed references to process. After this * many bytes we stop processing delayed references if there are * any more. If 0 it means to run all existing delayed references, * but not new ones added after running all existing ones. * Use (u64)-1 (U64_MAX) to run all existing delayed references * plus any new ones that are added. * * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; int ret; /* We'll clean this up in btrfs_cleanup_transaction */ if (TRANS_ABORTED(trans)) return 0; if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags)) return 0; delayed_refs = &trans->transaction->delayed_refs; again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif ret = __btrfs_run_delayed_refs(trans, min_bytes); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; } if (min_bytes == U64_MAX) { btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); if (xa_empty(&delayed_refs->head_refs)) { spin_unlock(&delayed_refs->lock); return 0; } spin_unlock(&delayed_refs->lock); cond_resched(); goto again; } return 0; } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; int ret; extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) return -ENOMEM; extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, btrfs_header_level(eb), extent_op); if (ret) btrfs_free_delayed_extent_op(extent_op); return ret; } static noinline int check_delayed_ref(struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 bytenr) { struct btrfs_root *root = inode->root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; struct rb_node *node; int ret = 0; spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; if (cur_trans) refcount_inc(&cur_trans->use_count); spin_unlock(&root->fs_info->trans_lock); if (!cur_trans) return 0; delayed_refs = &cur_trans->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); return 0; } if (!mutex_trylock(&head->mutex)) { if (path->nowait) { spin_unlock(&delayed_refs->lock); btrfs_put_transaction(cur_trans); return -EAGAIN; } refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); /* * Mutex was contended, block until it's released and let * caller try again */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); btrfs_put_transaction(cur_trans); return -EAGAIN; } spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); /* * XXX: We should replace this with a proper search function in the * future. */ for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { u64 ref_owner; u64 ref_offset; ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; break; } ref_owner = btrfs_delayed_ref_owner(ref); ref_offset = btrfs_delayed_ref_offset(ref); /* * If our ref doesn't match the one we're currently looking at * then we have a cross reference. */ if (ref->ref_root != btrfs_root_id(root) || ref_owner != btrfs_ino(inode) || ref_offset != offset) { ret = 1; break; } } spin_unlock(&head->lock); mutex_unlock(&head->mutex); btrfs_put_transaction(cur_trans); return ret; } /* * Check if there are references for a data extent other than the one belonging * to the given inode and offset. * * @inode: The only inode we expect to find associated with the data extent. * @path: A path to use for searching the extent tree. * @offset: The only offset we expect to find associated with the data extent. * @bytenr: The logical address of the data extent. * * When the extent does not have any other references other than the one we * expect to find, we always return a value of 0 with the path having a locked * leaf that contains the extent's extent item - this is necessary to ensure * we don't race with a task running delayed references, and our caller must * have such a path when calling check_delayed_ref() - it must lock a delayed * ref head while holding the leaf locked. In case the extent item is not found * in the extent tree, we return -ENOENT with the path having the leaf (locked) * where the extent item should be, in order to prevent races with another task * running delayed references, so that we don't miss any reference when calling * check_delayed_ref(). * * Note: this may return false positives, and this is because we want to be * quick here as we're called in write paths (when flushing delalloc and * in the direct IO write path). For example we can have an extent with * a single reference but that reference is not inlined, or we may have * many references in the extent tree but we also have delayed references * that cancel all the reference except the one for our inode and offset, * but it would be expensive to do such checks and complex due to all * locking to avoid races between the checks and flushing delayed refs, * plus non-inline references may be located on leaves other than the one * that contains the extent item in the extent tree. The important thing * here is to not return false negatives and that the false positives are * not very common. * * Returns: 0 if there are no cross references and with the path having a locked * leaf from the extent tree that contains the extent's extent item. * * 1 if there are cross references (false positives can happen). * * < 0 in case of an error. In case of -ENOENT the leaf in the extent * tree where the extent item should be located at is read locked and * accessible in the given path. */ static noinline int check_committed_ref(struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 bytenr) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr); struct extent_buffer *leaf; struct btrfs_extent_data_ref *ref; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; u32 expected_size; int type; int ret; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist an extent * item with such offset, but this is out of the valid range. */ return -EUCLEAN; } if (path->slots[0] == 0) return -ENOENT; path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) return -ENOENT; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); /* No inline refs; we need to bail before checking for owner ref. */ if (item_size == sizeof(*ei)) return 1; /* Check for an owner ref; skip over it to the real inline refs. */ iref = (struct btrfs_extent_inline_ref *)(ei + 1); type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); iref = (struct btrfs_extent_inline_ref *)(iref + 1); type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); } /* If extent item has more than 1 inline ref then it's shared */ if (item_size != expected_size) return 1; /* If this extent has SHARED_DATA_REF then it's shared */ if (type != BTRFS_EXTENT_DATA_REF_KEY) return 1; ref = (struct btrfs_extent_data_ref *)(&iref->offset); if (btrfs_extent_refs(leaf, ei) != btrfs_extent_data_ref_count(leaf, ref) || btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) || btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) || btrfs_extent_data_ref_offset(leaf, ref) != offset) return 1; return 0; } int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr, struct btrfs_path *path) { int ret; do { ret = check_committed_ref(inode, path, offset, bytenr); if (ret && ret != -ENOENT) goto out; /* * The path must have a locked leaf from the extent tree where * the extent item for our extent is located, in case it exists, * or where it should be located in case it doesn't exist yet * because it's new and its delayed ref was not yet flushed. * We need to lock the delayed ref head at check_delayed_ref(), * if one exists, while holding the leaf locked in order to not * race with delayed ref flushing, missing references and * incorrectly reporting that the extent is not shared. */ if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { struct extent_buffer *leaf = path->nodes[0]; ASSERT(leaf != NULL); btrfs_assert_tree_read_locked(leaf); if (ret != -ENOENT) { struct btrfs_key key; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ASSERT(key.objectid == bytenr); ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY); } } ret = check_delayed_ref(inode, path, offset, bytenr); } while (ret == -EAGAIN && !path->nowait); out: btrfs_release_path(path); if (btrfs_is_data_reloc_root(inode->root)) WARN_ON(ret > 0); return ret; } static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref, int inc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 parent; u64 ref_root; u32 nritems; struct btrfs_key key; struct btrfs_file_extent_item *fi; bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC); int i; int action; int level; int ret = 0; if (btrfs_is_testing(fs_info)) return 0; ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0) return 0; if (full_backref) parent = buf->start; else parent = 0; if (inc) action = BTRFS_ADD_DELAYED_REF; else action = BTRFS_DROP_DELAYED_REF; for (i = 0; i < nritems; i++) { struct btrfs_ref ref = { .action = action, .parent = parent, .ref_root = ref_root, }; if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) continue; ref.bytenr = btrfs_file_extent_disk_bytenr(buf, fi); if (ref.bytenr == 0) continue; ref.num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); ref.owning_root = ref_root; key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_data_ref(&ref, key.objectid, key.offset, btrfs_root_id(root), for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &ref); else ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } else { /* We don't know the owning_root, leave as 0. */ ref.bytenr = btrfs_node_blockptr(buf, i); ref.num_bytes = fs_info->nodesize; btrfs_init_tree_ref(&ref, level - 1, btrfs_root_id(root), for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &ref); else ret = btrfs_free_extent(trans, &ref); if (ret) goto fail; } } return 0; fail: return ret; } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref) { return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref) { return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; u64 flags; u64 ret; if (data) flags = BTRFS_BLOCK_GROUP_DATA; else if (root == fs_info->chunk_root) flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; ret = btrfs_get_alloc_profile(fs_info, flags); return ret; } static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { struct rb_node *leftmost; u64 bytenr = 0; read_lock(&fs_info->block_group_cache_lock); /* Get the block group with the lowest logical start address. */ leftmost = rb_first_cached(&fs_info->block_group_cache_tree); if (leftmost) { struct btrfs_block_group *bg; bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); bytenr = bg->start; } read_unlock(&fs_info->block_group_cache_lock); return bytenr; } static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ pin_down_extent(trans, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; } int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret; cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) return -EINVAL; /* * Fully cache the free space first so that our pin removes the free space * from the cache. */ ret = btrfs_cache_block_group(cache, true); if (ret) goto out; pin_down_extent(trans, cache, eb->start, eb->len, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, eb->start, eb->len); out: btrfs_put_block_group(cache); return ret; } static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { int ret; struct btrfs_block_group *block_group; block_group = btrfs_lookup_block_group(fs_info, start); if (!block_group) return -EINVAL; ret = btrfs_cache_block_group(block_group, true); if (ret) goto out; ret = btrfs_remove_free_space(block_group, start, num_bytes); out: btrfs_put_block_group(block_group); return ret; } int btrfs_exclude_logged_extents(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_file_extent_item *item; struct btrfs_key key; int found_type; int i; int ret = 0; if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) return 0; for (i = 0; i < btrfs_header_nritems(eb); i++) { btrfs_item_key_to_cpu(eb, &key, i); if (key.type != BTRFS_EXTENT_DATA_KEY) continue; item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(eb, item); if (found_type == BTRFS_FILE_EXTENT_INLINE) continue; if (btrfs_file_extent_disk_bytenr(eb, item) == 0) continue; key.objectid = btrfs_file_extent_disk_bytenr(eb, item); key.offset = btrfs_file_extent_disk_num_bytes(eb, item); ret = __exclude_logged_extent(fs_info, key.objectid, key.offset); if (ret) break; } return ret; } static void btrfs_inc_block_group_reservations(struct btrfs_block_group *bg) { atomic_inc(&bg->reservations); } /* * Returns the free cluster for the given space info and sets empty_cluster to * what it should be based on the mount options. */ static struct btrfs_free_cluster * fetch_cluster_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) return ret; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &fs_info->meta_alloc_cluster; if (btrfs_test_opt(fs_info, SSD)) *empty_cluster = SZ_2M; else *empty_cluster = SZ_64K; } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(fs_info, SSD_SPREAD)) { *empty_cluster = SZ_2M; ret = &fs_info->data_alloc_cluster; } return ret; } static int unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end, const bool return_free_space) { struct btrfs_block_group *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_free_cluster *cluster = NULL; u64 total_unpinned = 0; u64 empty_cluster = 0; bool readonly; int ret = 0; while (start <= end) { u64 len; readonly = false; if (!cache || start >= cache->start + cache->length) { if (cache) btrfs_put_block_group(cache); total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); if (cache == NULL) { /* Logic error, something removed the block group. */ ret = -EUCLEAN; goto out; } cluster = fetch_cluster_info(fs_info, cache->space_info, &empty_cluster); empty_cluster <<= 1; } len = cache->start + cache->length - start; len = min(len, end + 1 - start); if (return_free_space) btrfs_add_free_space(cache, start, len); start += len; total_unpinned += len; space_info = cache->space_info; /* * If this space cluster has been marked as fragmented and we've * unpinned enough in this block group to potentially allow a * cluster to be created inside of it go ahead and clear the * fragmented check. */ if (cluster && cluster->fragmented && total_unpinned > empty_cluster) { spin_lock(&cluster->lock); cluster->fragmented = 0; spin_unlock(&cluster->lock); } spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; if (cache->ro) { space_info->bytes_readonly += len; readonly = true; } else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ btrfs_space_info_update_bytes_zone_unusable(space_info, len); readonly = true; } spin_unlock(&cache->lock); if (!readonly && return_free_space) btrfs_return_free_space(space_info, len); spin_unlock(&space_info->lock); } if (cache) btrfs_put_block_group(cache); out: return ret; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree *unpin; u64 start; u64 end; int ret; unpin = &trans->transaction->pinned_extents; while (!TRANS_ABORTED(trans)) { struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); if (!find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, &cached_state); ret = unpin_extent_range(fs_info, start, end, true); BUG_ON(ret); mutex_unlock(&fs_info->unused_bg_unpin_mutex); free_extent_state(cached_state); cond_resched(); } if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); btrfs_discard_schedule_work(&fs_info->discard_ctl, true); } /* * Transaction is finished. We don't need the lock anymore. We * do need to clean up the block groups in case of a transaction * abort. */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { u64 trimmed = 0; ret = -EROFS; if (!TRANS_ABORTED(trans)) ret = btrfs_discard_extent(fs_info, block_group->start, block_group->length, &trimmed); /* * Not strictly necessary to lock, as the block_group should be * read-only from btrfs_delete_unused_bgs(). */ ASSERT(block_group->ro); spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); spin_unlock(&fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); if (ret) { const char *errstr = btrfs_decode_error(ret); btrfs_warn(fs_info, "discard failed while removing blockgroup: errno=%d %s", ret, errstr); } } return 0; } /* * Parse an extent item's inline extents looking for a simple quotas owner ref. * * @fs_info: the btrfs_fs_info for this mount * @leaf: a leaf in the extent tree containing the extent item * @slot: the slot in the leaf where the extent item is found * * Returns the objectid of the root that originally allocated the extent item * if the inline owner ref is expected and present, otherwise 0. * * If an extent item has an owner ref item, it will be the first inline ref * item. Therefore the logic is to check whether there are any inline ref * items, then check the type of the first one. */ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_owner_ref *oref; unsigned long ptr; unsigned long end; int type; if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) return 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); ptr = (unsigned long)(ei + 1); end = (unsigned long)ei + btrfs_item_size(leaf, slot); /* No inline ref items of any kind, can't check type. */ if (ptr == end) return 0; iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* We found an owner ref, get the root out of it. */ if (type == BTRFS_EXTENT_OWNER_REF_KEY) { oref = (struct btrfs_extent_owner_ref *)(&iref->offset); return btrfs_extent_owner_ref_root_id(leaf, oref); } /* We have inline refs, but not an owner ref. */ return 0; } static int do_free_extent_accounting(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_squota_delta *delta) { int ret; u64 num_bytes = delta->num_bytes; if (delta->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(trans->fs_info, bytenr); ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } } ret = btrfs_record_squota_delta(trans->fs_info, delta); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = add_to_free_space_tree(trans, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); if (ret) btrfs_abort_transaction(trans, ret); return ret; } #define abort_and_dump(trans, path, fmt, args...) \ ({ \ btrfs_abort_transaction(trans, -EUCLEAN); \ btrfs_print_leaf(path->nodes[0]); \ btrfs_crit(trans->fs_info, fmt, ##args); \ }) /* * Drop one or more refs of @node. * * 1. Locate the extent refs. * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. * Locate it, then reduce the refs number or remove the ref line completely. * * 2. Update the refs count in EXTENT/METADATA_ITEM * * Inline backref case: * * in extent tree we have: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 * refs 2 gen 6 flags DATA * extent data backref root FS_TREE objectid 258 offset 0 count 1 * extent data backref root FS_TREE objectid 257 offset 0 count 1 * * This function gets called with: * * node->bytenr = 13631488 * node->num_bytes = 1048576 * root_objectid = FS_TREE * owner_objectid = 257 * owner_offset = 0 * refs_to_drop = 1 * * Then we should get some like: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 * refs 1 gen 6 flags DATA * extent data backref root FS_TREE objectid 258 offset 0 count 1 * * Keyed backref case: * * in extent tree we have: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 * refs 754 gen 6 flags DATA * [...] * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28 * extent data backref root FS_TREE objectid 866 offset 0 count 1 * * This function get called with: * * node->bytenr = 13631488 * node->num_bytes = 1048576 * root_objectid = FS_TREE * owner_objectid = 866 * owner_offset = 0 * refs_to_drop = 1 * * Then we should get some like: * * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 * refs 753 gen 6 flags DATA * * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; struct btrfs_path *path; struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; int ret; int is_data; int extent_slot = 0; int found_extent = 0; int num_to_del = 1; int refs_to_drop = node->ref_mod; u32 item_size; u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; u64 owner_objectid = btrfs_delayed_ref_owner(node); u64 owner_offset = btrfs_delayed_ref_offset(node); bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); u64 delayed_ref_root = href->owning_root; extent_root = btrfs_extent_root(info, bytenr); ASSERT(extent_root); path = btrfs_alloc_path(); if (!path) return -ENOMEM; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; if (!is_data && refs_to_drop != 1) { btrfs_crit(info, "invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", node->bytenr, refs_to_drop); ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } if (is_data) skinny_metadata = false; ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes, node->parent, node->ref_root, owner_objectid, owner_offset); if (ret == 0) { /* * Either the inline backref or the SHARED_DATA_REF/ * SHARED_BLOCK_REF is found * * Here is a quick path to locate EXTENT/METADATA_ITEM. * It's possible the EXTENT/METADATA_ITEM is near current slot. */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, extent_slot); if (key.objectid != bytenr) break; if (key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes) { found_extent = 1; break; } if (key.type == BTRFS_METADATA_ITEM_KEY && key.offset == owner_objectid) { found_extent = 1; break; } /* Quick path didn't find the EXTENT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { if (iref) { abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); ret = -EUCLEAN; goto out; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; if (!is_data && skinny_metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = owner_objectid; } ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret > 0 && skinny_metadata && path->slots[0]) { /* * Couldn't find our skinny metadata item, * see if we have ye olde extent item. */ path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && key.offset == num_bytes) ret = 0; } if (ret > 0 && skinny_metadata) { skinny_metadata = false; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; btrfs_release_path(path); ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); } if (ret) { if (ret > 0) btrfs_print_leaf(path->nodes[0]); btrfs_err(info, "umm, got %d back from search, was looking for %llu, slot %d", ret, bytenr, path->slots[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { abort_and_dump(trans, path, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); goto out; } else { btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { ret = -EUCLEAN; btrfs_err(trans->fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; if (item_size < sizeof(*ei) + sizeof(*bi)) { abort_and_dump(trans, path, "invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, path->slots[0], owner_objectid, item_size, sizeof(*ei) + sizeof(*bi)); ret = -EUCLEAN; goto out; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); ret = -EUCLEAN; goto out; } refs -= refs_to_drop; if (refs > 0) { if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* * In the case of inline back ref, reference count will * be updated by remove_extent_backref */ if (iref) { if (!found_extent) { abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); ret = -EUCLEAN; goto out; } } else { btrfs_set_extent_refs(leaf, ei, refs); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } } else { struct btrfs_squota_delta delta = { .root = delayed_ref_root, .num_bytes = num_bytes, .is_data = is_data, .is_inc = false, .generation = btrfs_extent_generation(leaf, ei), }; /* In this branch refs == 1 */ if (found_extent) { if (is_data && refs_to_drop != extent_data_ref_count(path, iref)) { abort_and_dump(trans, path, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), refs_to_drop, path->slots[0]); ret = -EUCLEAN; goto out; } if (iref) { if (path->slots[0] != extent_slot) { abort_and_dump(trans, path, "invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", key.objectid, key.type, key.offset, path->slots[0]); ret = -EUCLEAN; goto out; } } else { /* * No inline ref, we must be at SHARED_* item, * And it's single ref, it must be: * | extent_slot ||extent_slot + 1| * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ if (path->slots[0] != extent_slot + 1) { abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", path->slots[0]); ret = -EUCLEAN; goto out; } path->slots[0] = extent_slot; num_to_del = 2; } } /* * We can't infer the data owner from the delayed ref, so we need * to try to get it from the owning ref item. * * If it is not present, then that extent was not written under * simple quotas mode, so we don't need to account for its deletion. */ if (is_data) delta.root = btrfs_get_extent_owner_root(trans->fs_info, leaf, extent_slot); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); ret = do_free_extent_accounting(trans, bytenr, &delta); } btrfs_release_path(path); out: btrfs_free_path(path); return ret; } /* * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. */ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, u64 bytenr) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr); if (!head) goto out_delayed_unlock; spin_lock(&head->lock); if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root)) goto out; if (cleanup_extent_op(head) != NULL) goto out; /* * waiting for the lock here would deadlock. If someone else has it * locked they are already in the process of dropping it anyway */ if (!mutex_trylock(&head->mutex)) goto out; btrfs_delete_ref_head(fs_info, delayed_refs, head); head->processing = false; spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); if (head->must_insert_reserved) ret = 1; btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); return ret; out: spin_unlock(&head->lock); out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } int btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, struct extent_buffer *buf, u64 parent, int last_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *bg; int ret; if (root_id != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_ref generic_ref = { .action = BTRFS_DROP_DELAYED_REF, .bytenr = buf->start, .num_bytes = buf->len, .parent = parent, .owning_root = btrfs_header_owner(buf), .ref_root = root_id, }; /* * Assert that the extent buffer is not cleared due to * EXTENT_BUFFER_ZONED_ZEROOUT. Please refer * btrfs_clear_buffer_dirty() and btree_csum_one_bio() for * detail. */ ASSERT(btrfs_header_bytenr(buf) != 0); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), 0, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); if (ret < 0) return ret; } if (!last_ref) return 0; if (btrfs_header_generation(buf) != trans->transid) goto out; if (root_id != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); if (!ret) goto out; } bg = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(trans, bg, buf->start, buf->len, 1); btrfs_put_block_group(bg); goto out; } /* * If there are tree mod log users we may have recorded mod log * operations for this node. If we re-allocate this node we * could replay operations on this node that happened when it * existed in a completely different root. For example if it * was part of root A, then was reallocated to root B, and we * are doing a btrfs_old_search_slot(root b), we could replay * operations that happened when the block was part of root A, * giving us an inconsistent view of the btree. * * We are safe from races here because at this point no other * node or root points to this extent buffer, so if after this * check a new tree mod log user joins we will not have an * existing log of operations on this node that we have to * contend with. */ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) || btrfs_is_zoned(fs_info)) { pin_down_extent(trans, bg, buf->start, buf->len, 1); btrfs_put_block_group(bg); goto out; } WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(bg, buf->start, buf->len); btrfs_free_reserved_bytes(bg, buf->len, 0); btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); out: /* * Deleting the buffer, clear the corrupt flag since it doesn't * matter anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); return 0; } /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; if (btrfs_is_testing(fs_info)) return 0; /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. */ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); } else { ret = btrfs_add_delayed_data_ref(trans, ref, 0); } if (ref->ref_root != BTRFS_TREE_LOG_OBJECTID) btrfs_ref_tree_mod(fs_info, ref); return ret; } enum btrfs_loop_type { /* * Start caching block groups but do not wait for progress or for them * to be done. */ LOOP_CACHING_NOWAIT, /* * Wait for the block group free_space >= the space we're waiting for if * the block group isn't cached. */ LOOP_CACHING_WAIT, /* * Allow allocations to happen from block groups that do not yet have a * size classification. */ LOOP_UNSET_SIZE_CLASS, /* * Allocate a chunk and then retry the allocation. */ LOOP_ALLOC_CHUNK, /* * Ignore the size class restrictions for this allocation. */ LOOP_WRONG_SIZE_CLASS, /* * Ignore the empty size, only try to allocate the number of bytes * needed for this allocation. */ LOOP_NO_EMPTY_SIZE, }; static inline void btrfs_lock_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) down_read(&cache->data_rwsem); } static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, int delalloc) { btrfs_get_block_group(cache); if (delalloc) down_read(&cache->data_rwsem); } static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, int delalloc) __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; spin_lock(&cluster->refill_lock); while (1) { used_bg = cluster->block_group; if (!used_bg) return NULL; if (used_bg == block_group) return used_bg; btrfs_get_block_group(used_bg); if (!delalloc) return used_bg; if (down_read_trylock(&used_bg->data_rwsem)) return used_bg; spin_unlock(&cluster->refill_lock); /* We should only have one-level nested. */ down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); spin_lock(&cluster->refill_lock); if (used_bg == cluster->block_group) return used_bg; up_read(&used_bg->data_rwsem); btrfs_put_block_group(used_bg); } } static inline void btrfs_release_block_group(struct btrfs_block_group *cache, int delalloc) { if (delalloc) up_read(&cache->data_rwsem); btrfs_put_block_group(cache); } /* * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ static int find_free_extent_clustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **cluster_bg_ret) { struct btrfs_block_group *cluster_bg; struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 aligned_cluster; u64 offset; int ret; cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc); if (!cluster_bg) goto refill_cluster; if (cluster_bg != bg && (cluster_bg->ro || !block_group_bits(cluster_bg, ffe_ctl->flags))) goto release_cluster; offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, ffe_ctl->num_bytes, cluster_bg->start, &ffe_ctl->max_extent_size); if (offset) { /* We have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(cluster_bg, ffe_ctl); *cluster_bg_ret = cluster_bg; ffe_ctl->found_offset = offset; return 0; } WARN_ON(last_ptr->block_group != cluster_bg); release_cluster: /* * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so * lets just skip it and let the allocator find whatever block it can * find. If we reach this point, we will have tried the cluster * allocator plenty of times and not have found anything, so we are * likely way too fragmented for the clustering stuff to find anything. * * However, if the cluster is taken from the current block group, * release the cluster first, so that we stand a better chance of * succeeding in the unclustered allocation. */ if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) { spin_unlock(&last_ptr->refill_lock); btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); return -ENOENT; } /* This cluster didn't work out, free it and start over */ btrfs_return_cluster_to_free_space(NULL, last_ptr); if (cluster_bg != bg) btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc); refill_cluster: if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) { spin_unlock(&last_ptr->refill_lock); return -ENOENT; } aligned_cluster = max_t(u64, ffe_ctl->empty_cluster + ffe_ctl->empty_size, bg->full_stripe_len); ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start, ffe_ctl->num_bytes, aligned_cluster); if (ret == 0) { /* Now pull our allocation out of this cluster */ offset = btrfs_alloc_from_cluster(bg, last_ptr, ffe_ctl->num_bytes, ffe_ctl->search_start, &ffe_ctl->max_extent_size); if (offset) { /* We found one, proceed */ spin_unlock(&last_ptr->refill_lock); ffe_ctl->found_offset = offset; trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } } /* * At this point we either didn't find a cluster or we weren't able to * allocate a block from our cluster. Free the cluster we've been * trying to use, and go to the next block group. */ btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); return 1; } /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) { struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; u64 offset; /* * We are doing an unclustered allocation, set the fragmented flag so * we don't bother trying to setup a cluster again until we get more * space. */ if (unlikely(last_ptr)) { spin_lock(&last_ptr->lock); last_ptr->fragmented = 1; spin_unlock(&last_ptr->lock); } if (ffe_ctl->cached) { struct btrfs_free_space_ctl *free_space_ctl; free_space_ctl = bg->free_space_ctl; spin_lock(&free_space_ctl->tree_lock); if (free_space_ctl->free_space < ffe_ctl->num_bytes + ffe_ctl->empty_cluster + ffe_ctl->empty_size) { ffe_ctl->total_free_space = max_t(u64, ffe_ctl->total_free_space, free_space_ctl->free_space); spin_unlock(&free_space_ctl->tree_lock); return 1; } spin_unlock(&free_space_ctl->tree_lock); } offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, ffe_ctl->num_bytes, ffe_ctl->empty_size, &ffe_ctl->max_extent_size); if (!offset) return 1; ffe_ctl->found_offset = offset; return 0; } static int do_allocation_clustered(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { int ret; /* We want to try and use the cluster allocator, so lets look there */ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); if (ret >= 0) return ret; /* ret == -ENOENT case falls through */ } return find_free_extent_unclustered(block_group, ffe_ctl); } /* * Tree-log block group locking * ============================ * * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which * indicates the starting address of a block group, which is reserved only * for tree-log metadata. * * Lock nesting * ============ * * space_info::lock * block_group::lock * fs_info::treelog_bg_lock */ /* * Simple allocator for sequential-only block group. It only allows sequential * allocation. No need to play with trees. This function also reserves the * bytes as in btrfs_add_reserved_bytes. */ static int do_allocation_zoned(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 start = block_group->start; u64 num_bytes = ffe_ctl->num_bytes; u64 avail; u64 bytenr = block_group->start; u64 log_bytenr; u64 data_reloc_bytenr; int ret = 0; bool skip = false; ASSERT(btrfs_is_zoned(block_group->fs_info)); /* * Do not allow non-tree-log blocks in the dedicated tree-log block * group, and vice versa. */ spin_lock(&fs_info->treelog_bg_lock); log_bytenr = fs_info->treelog_bg; if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || (!ffe_ctl->for_treelog && bytenr == log_bytenr))) skip = true; spin_unlock(&fs_info->treelog_bg_lock); if (skip) return 1; /* * Do not allow non-relocation blocks in the dedicated relocation block * group, and vice versa. */ spin_lock(&fs_info->relocation_bg_lock); data_reloc_bytenr = fs_info->data_reloc_bg; if (data_reloc_bytenr && ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) skip = true; spin_unlock(&fs_info->relocation_bg_lock); if (skip) return 1; /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. * Return the error after taking the locks. */ } spin_unlock(&block_group->lock); /* Metadata block group is activated at write time. */ if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && !btrfs_zone_activate(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. * Return the error after taking the locks. */ } spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); spin_lock(&fs_info->relocation_bg_lock); if (ret) goto out; ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); ASSERT(!ffe_ctl->for_data_reloc || block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); if (block_group->ro || (!ffe_ctl->for_data_reloc && test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { ret = 1; goto out; } /* * Do not allow currently using block group to be tree-log dedicated * block group. */ if (ffe_ctl->for_treelog && !fs_info->treelog_bg && (block_group->used || block_group->reserved)) { ret = 1; goto out; } /* * Do not allow currently used block group to be the data relocation * dedicated block group. */ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && (block_group->used || block_group->reserved)) { ret = 1; goto out; } WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); avail = block_group->zone_capacity - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { /* * With sequential allocator, free space is always * contiguous */ ffe_ctl->max_extent_size = avail; ffe_ctl->total_free_space = avail; } ret = 1; goto out; } if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; if (ffe_ctl->for_data_reloc) { if (!fs_info->data_reloc_bg) fs_info->data_reloc_bg = block_group->start; /* * Do not allow allocations from this block group, unless it is * for data relocation. Compared to increasing the ->ro, setting * the ->zoned_data_reloc_ongoing flag still allows nocow * writers to come in. See btrfs_inc_nocow_writers(). * * We need to disable an allocation to avoid an allocation of * regular (non-relocation data) extent. With mix of relocation * extents and regular extents, we can dispatch WRITE commands * (for relocation extents) and ZONE APPEND commands (for * regular extents) at the same time to the same zone, which * easily break the write pointer. * * Also, this flag avoids this block group to be zone finished. */ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); } ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); ctl->free_space -= num_bytes; spin_unlock(&ctl->tree_lock); /* * We do not check if found_offset is aligned to stripesize. The * address is anyway rewritten when using zone append writing. */ ffe_ctl->search_start = ffe_ctl->found_offset; out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0; spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); return ret; } static int do_allocation(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return do_allocation_clustered(block_group, ffe_ctl, bg_ret); case BTRFS_EXTENT_ALLOC_ZONED: return do_allocation_zoned(block_group, ffe_ctl, bg_ret); default: BUG(); } } static void release_block_group(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, int delalloc) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: ffe_ctl->retry_uncached = false; break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ break; default: BUG(); } BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) != ffe_ctl->index); btrfs_release_block_group(block_group, delalloc); } static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl, struct btrfs_key *ins) { struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; if (!ffe_ctl->use_cluster && last_ptr) { spin_lock(&last_ptr->lock); last_ptr->window_start = ins->objectid; spin_unlock(&last_ptr->lock); } } static void found_extent(struct find_free_extent_ctl *ffe_ctl, struct btrfs_key *ins) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: found_extent_clustered(ffe_ctl, ins); break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ break; default: BUG(); } } static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { /* Block group's activeness is not a requirement for METADATA block groups. */ if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) return 0; /* If we can activate new zone, just allocate a chunk and use it */ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) return 0; /* * We already reached the max active zones. Try to finish one block * group to make a room for a new block group. This is only possible * for a data block group because btrfs_zone_finish() may need to wait * for a running transaction which can cause a deadlock for metadata * allocation. */ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { int ret = btrfs_zone_finish_one_bg(fs_info); if (ret == 1) return 0; else if (ret < 0) return ret; } /* * If we have enough free space left in an already active block group * and we can't activate any other zone now, do not allow allocating a * new chunk and let find_free_extent() retry with a smaller size. */ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) return -ENOSPC; /* * Even min_alloc_size is not left in any block groups. Since we cannot * activate a new block group, allocating it may not help. Let's tell a * caller to try again and hope it progress something by writing some * parts of the region. That is only possible for data block groups, * where a part of the region can be written. */ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) return -EAGAIN; /* * We cannot activate a new block group and no enough space left in any * block groups. So, allocating a new block group may not help. But, * there is nothing to do anyway, so let's go with it. */ return 0; } static int can_allocate_chunk(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return 0; case BTRFS_EXTENT_ALLOC_ZONED: return can_allocate_chunk_zoned(fs_info, ffe_ctl); default: BUG(); } } /* * Return >0 means caller needs to re-search for free extent * Return 0 means we have the needed free extent. * Return <0 means we failed to locate any free extent. */ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; int ret; if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) && ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) ffe_ctl->orig_have_caching_bg = true; if (ins->objectid) { found_extent(ffe_ctl, ins); return 0; } if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) return 1; ffe_ctl->index++; if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) return 1; /* See the comments for btrfs_loop_type for an explanation of the phases. */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; /* * We want to skip the LOOP_CACHING_WAIT step if we don't have * any uncached bgs and we've already done a full search * through. */ if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; ffe_ctl->loop++; if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; /* Check if allocation policy allows to create a new chunk */ ret = can_allocate_chunk(fs_info, ffe_ctl); if (ret) return ret; trans = current->journal_info; if (trans) exist = 1; else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); return ret; } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) { ret = 0; ffe_ctl->loop++; } else if (ret < 0) btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) btrfs_end_transaction(trans); if (ret) return ret; } if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) return -ENOSPC; /* * Don't loop again if we already have no empty_size and * no empty_cluster. */ if (ffe_ctl->empty_size == 0 && ffe_ctl->empty_cluster == 0) return -ENOSPC; ffe_ctl->empty_size = 0; ffe_ctl->empty_cluster = 0; } return 1; } return -ENOSPC; } static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group *bg) { if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) return true; if (!btrfs_block_group_should_use_size_class(bg)) return true; if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) return true; if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && bg->size_class == BTRFS_BG_SZ_NONE) return true; return ffe_ctl->size_class == bg->size_class; } static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, struct btrfs_key *ins) { /* * If our free space is heavily fragmented we may not be able to make * big contiguous allocations, so instead of doing the expensive search * for free space, simply return ENOSPC with our max_extent_size so we * can go ahead and search for a more manageable chunk. * * If our max_extent_size is large enough for our allocation simply * disable clustering since we will likely not be able to find enough * space to create a cluster and induce latency trying. */ if (space_info->max_extent_size) { spin_lock(&space_info->lock); if (space_info->max_extent_size && ffe_ctl->num_bytes > space_info->max_extent_size) { ins->offset = space_info->max_extent_size; spin_unlock(&space_info->lock); return -ENOSPC; } else if (space_info->max_extent_size) { ffe_ctl->use_cluster = false; } spin_unlock(&space_info->lock); } ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info, &ffe_ctl->empty_cluster); if (ffe_ctl->last_ptr) { struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr; spin_lock(&last_ptr->lock); if (last_ptr->block_group) ffe_ctl->hint_byte = last_ptr->window_start; if (last_ptr->fragmented) { /* * We still set window_start so we can keep track of the * last place we found an allocation to try and save * some time. */ ffe_ctl->hint_byte = last_ptr->window_start; ffe_ctl->use_cluster = false; } spin_unlock(&last_ptr->lock); } return 0; } static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { if (ffe_ctl->for_treelog) { spin_lock(&fs_info->treelog_bg_lock); if (fs_info->treelog_bg) ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); } else if (ffe_ctl->for_data_reloc) { spin_lock(&fs_info->relocation_bg_lock); if (fs_info->data_reloc_bg) ffe_ctl->hint_byte = fs_info->data_reloc_bg; spin_unlock(&fs_info->relocation_bg_lock); } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { struct btrfs_block_group *block_group; spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { /* * No lock is OK here because avail is monotinically * decreasing, and this is just a hint. */ u64 avail = block_group->zone_capacity - block_group->alloc_offset; if (block_group_bits(block_group, ffe_ctl->flags) && avail >= ffe_ctl->num_bytes) { ffe_ctl->hint_byte = block_group->start; break; } } spin_unlock(&fs_info->zone_active_bgs_lock); } return 0; } static int prepare_allocation(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl, struct btrfs_space_info *space_info, struct btrfs_key *ins) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: return prepare_allocation_zoned(fs_info, ffe_ctl); default: BUG(); } } /* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY * ins->offset == the size of the hole. * Any available blocks before search_start are skipped. * * If there is no suitable free space, we will record the max size of * the free space extent currently. * * The overall logic and call chain: * * find_free_extent() * |- Iterate through all block groups * | |- Get a valid block group * | |- Try to do clustered allocation in that block group * | |- Try to do unclustered allocation in that block group * | |- Check if the result is valid * | | |- If valid, then exit * | |- Jump to next block group * | * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_root *root, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; struct btrfs_space_info *space_info; bool full_search = false; WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); ffe_ctl->search_start = 0; /* For clustered allocation */ ffe_ctl->empty_cluster = 0; ffe_ctl->last_ptr = NULL; ffe_ctl->use_cluster = true; ffe_ctl->have_caching_bg = false; ffe_ctl->orig_have_caching_bg = false; ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); ffe_ctl->loop = 0; ffe_ctl->retry_uncached = false; ffe_ctl->cached = 0; ffe_ctl->max_extent_size = 0; ffe_ctl->total_free_space = 0; ffe_ctl->found_offset = 0; ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; ffe_ctl->size_class = btrfs_calc_block_group_size_class(ffe_ctl->num_bytes); if (btrfs_is_zoned(fs_info)) ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; trace_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); return -ENOSPC; } ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); if (ret < 0) return ret; ffe_ctl->search_start = max(ffe_ctl->search_start, first_logical_byte(fs_info)); ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, ffe_ctl->search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. * * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { /* * someone is removing this block group, * we can't jump into the have_block_group * target because our list pointers are not * valid */ btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { ffe_ctl->index = btrfs_bg_flags_to_raid_index( block_group->flags); btrfs_lock_block_group(block_group, ffe_ctl->delalloc); ffe_ctl->hinted = true; goto have_block_group; } } else if (block_group) { btrfs_put_block_group(block_group); } } search: trace_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; ffe_ctl->hinted = false; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { if (ffe_ctl->for_treelog) btrfs_clear_treelog_bg(block_group); if (ffe_ctl->for_data_reloc) btrfs_clear_data_reloc_bg(block_group); continue; } btrfs_grab_block_group(block_group, ffe_ctl->delalloc); ffe_ctl->search_start = block_group->start; /* * this can happen if we end up cycling through all the * raid types, but we want to make sure we only allocate * for the proper type. */ if (!block_group_bits(block_group, ffe_ctl->flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID10; /* * if they asked for extra copies and this block group * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) goto loop; /* * This block group has different flags than we want. * It's possible that we have MIXED_GROUP flag but no * block group is mixed. Just skip such block group. */ btrfs_release_block_group(block_group, ffe_ctl->delalloc); continue; } have_block_group: trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; ret = btrfs_cache_block_group(block_group, false); /* * If we get ENOMEM here or something else we want to * try other block groups, because it may not be fatal. * However if we can't find anything else we need to * save our return here so that we return the actual * error that caused problems, not ENOSPC. */ if (ret < 0) { if (!cache_block_group_error) cache_block_group_error = ret; ret = 0; goto loop; } ret = 0; } if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) { if (!cache_block_group_error) cache_block_group_error = -EIO; goto loop; } if (!find_free_extent_check_size_class(ffe_ctl, block_group)) goto loop; bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret > 0) goto loop; if (bg_ret && bg_ret != block_group) { btrfs_release_block_group(block_group, ffe_ctl->delalloc); block_group = bg_ret; } /* Checks */ ffe_ctl->search_start = round_up(ffe_ctl->found_offset, fs_info->stripesize); /* move on to the next group */ if (ffe_ctl->search_start + ffe_ctl->num_bytes > block_group->start + block_group->length) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->num_bytes); goto loop; } if (ffe_ctl->found_offset < ffe_ctl->search_start) btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->search_start - ffe_ctl->found_offset); ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, ffe_ctl->num_bytes, ffe_ctl->delalloc, ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, ffe_ctl->found_offset, ffe_ctl->num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = ffe_ctl->search_start; ins->offset = ffe_ctl->num_bytes; trace_btrfs_reserve_extent(block_group, ffe_ctl); btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && !ffe_ctl->retry_uncached) { ffe_ctl->retry_uncached = true; btrfs_wait_block_group_cache_progress(block_group, ffe_ctl->num_bytes + ffe_ctl->empty_cluster + ffe_ctl->empty_size); goto have_block_group; } release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } up_read(&space_info->groups_sem); ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); if (ret > 0) goto search; if (ret == -ENOSPC && !cache_block_group_error) { /* * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. */ if (!ffe_ctl->max_extent_size) ffe_ctl->max_extent_size = ffe_ctl->total_free_space; spin_lock(&space_info->lock); space_info->max_extent_size = ffe_ctl->max_extent_size; spin_unlock(&space_info->lock); ins->offset = ffe_ctl->max_extent_size; } else if (ret == -ENOSPC) { ret = cache_block_group_error; } return ret; } /* * Entry point to the extent allocator. Tries to find a hole that is at least * as big as @num_bytes. * * @root - The root that will contain this extent * * @ram_bytes - The amount of space in ram that @num_bytes take. This * is used for accounting purposes. This value differs * from @num_bytes only in the case of compressed extents. * * @num_bytes - Number of bytes to allocate on-disk. * * @min_alloc_size - Indicates the minimum amount of space that the * allocator should try to satisfy. In some cases * @num_bytes may be larger than what is required and if * the filesystem is fragmented then allocation fails. * However, the presence of @min_alloc_size gives a * chance to try and satisfy the smaller allocation. * * @empty_size - A hint that you plan on doing more COW. This is the * size in bytes the allocator should try to find free * next to the block it returns. This is just a hint and * may be ignored by the allocator. * * @hint_byte - Hint to the allocator to start searching above the byte * address passed. It might be ignored. * * @ins - This key is modified to record the found hole. It will * have the following values: * ins->objectid == start position * ins->flags = BTRFS_EXTENT_ITEM_KEY * ins->offset == the size of the hole. * * @is_data - Boolean flag indicating whether an extent is * allocated for data (true) or metadata (false) * * @delalloc - Boolean flag indicating whether this allocation is for * delalloc or not. If 'true' data_rwsem of block groups * is going to be acquired. * * * Returns 0 when an allocation succeeded or < 0 when an error occurred. In * case -ENOSPC is returned then @ins->offset will contain the size of the * largest available hole the allocator managed to find. */ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct find_free_extent_ctl ffe_ctl = {}; bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; bool for_treelog = (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID); bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); ffe_ctl.ram_bytes = ram_bytes; ffe_ctl.num_bytes = num_bytes; ffe_ctl.min_alloc_size = min_alloc_size; ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; ffe_ctl.delalloc = delalloc; ffe_ctl.hint_byte = hint_byte; ffe_ctl.for_treelog = for_treelog; ffe_ctl.for_data_reloc = for_data_reloc; ret = find_free_extent(root, ins, &ffe_ctl); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, fs_info->sectorsize); num_bytes = max(num_bytes, min_alloc_size); ram_bytes = num_bytes; if (num_bytes == min_alloc_size) final_tried = true; goto again; } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); } } return ret; } int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { btrfs_err(fs_info, "Unable to find block group for %llu", start); return -ENOSPC; } btrfs_add_free_space(cache, start, len); btrfs_free_reserved_bytes(cache, len, delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); return 0; } int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret = 0; cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) { btrfs_err(trans->fs_info, "unable to find block group for %llu", eb->start); return -ENOSPC; } ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); btrfs_put_block_group(cache); return ret; } static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; ret = remove_from_free_space_tree(trans, bytenr, num_bytes); if (ret) return ret; ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); if (ret) { ASSERT(!ret); btrfs_err(fs_info, "update block group failed for %llu %llu", bytenr, num_bytes); return ret; } trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); return 0; } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod, u64 oref_root) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; size = sizeof(*extent_item); if (simple_quota) size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) return -ENOMEM; extent_root = btrfs_extent_root(fs_info, ins->objectid); ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size); if (ret) { btrfs_free_path(path); return ret; } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); btrfs_set_extent_refs(leaf, extent_item, ref_mod); btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); if (simple_quota) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); oref = (struct btrfs_extent_owner_ref *)(&iref->offset); btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); iref = (struct btrfs_extent_inline_ref *)(oref + 1); } btrfs_set_extent_inline_ref_type(leaf, iref, type); if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); btrfs_set_extent_inline_ref_offset(leaf, iref, parent); btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); } else { struct btrfs_extent_data_ref *ref; ref = (struct btrfs_extent_data_ref *)(&iref->offset); btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); btrfs_set_extent_data_ref_objectid(leaf, ref, owner); btrfs_set_extent_data_ref_offset(leaf, ref, offset); btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; struct btrfs_key extent_key; struct btrfs_tree_block_info *block_info; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); const u64 flags = (extent_op ? extent_op->flags_to_set : 0); /* The owner of a tree block is the level. */ int level = btrfs_delayed_ref_owner(node); bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); extent_key.objectid = node->bytenr; if (skinny_metadata) { /* The owner of a tree block is the level. */ extent_key.offset = level; extent_key.type = BTRFS_METADATA_ITEM_KEY; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); } path = btrfs_alloc_path(); if (!path) return -ENOMEM; extent_root = btrfs_extent_root(fs_info, extent_key.objectid); ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key, size); if (ret) { btrfs_free_path(path); return ret; } leaf = path->nodes[0]; extent_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); btrfs_set_extent_refs(leaf, extent_item, 1); btrfs_set_extent_generation(leaf, extent_item, trans->transid); btrfs_set_extent_flags(leaf, extent_item, flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, &extent_op->key); btrfs_set_tree_block_level(leaf, block_info, level); iref = (struct btrfs_extent_inline_ref *)(block_info + 1); } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent); } else { btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root); } btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { .action = BTRFS_ADD_DELAYED_EXTENT, .bytenr = ins->objectid, .num_bytes = ins->offset, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID); if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) generic_ref.owning_root = root->relocation_src_root; btrfs_init_data_ref(&generic_ref, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); } /* * this is used by the tree logging recovery code. It records that * an extent has been allocated and makes sure to clear the free * space cache bits as well */ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins) { struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_squota_delta delta = { .root = root_objectid, .num_bytes = ins->offset, .generation = trans->transid, .is_data = true, .is_inc = true, }; /* * Mixed block groups will exclude before processing the log so we only * need to do the exclude dance if this fs isn't mixed. */ if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(fs_info, ins->objectid, ins->offset); if (ret) return ret; } block_group = btrfs_lookup_block_group(fs_info, ins->objectid); if (!block_group) return -EINVAL; space_info = block_group->space_info; spin_lock(&space_info->lock); spin_lock(&block_group->lock); space_info->bytes_reserved += ins->offset; block_group->reserved += ins->offset; spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1, root_objectid); if (ret) btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; } #ifdef CONFIG_BTRFS_DEBUG /* * Extra safety check in case the extent tree is corrupted and extent allocator * chooses to use a tree block which is already used and locked. */ static bool check_eb_lock_owner(const struct extent_buffer *eb) { if (eb->lock_owner == current->pid) { btrfs_err_rl(eb->fs_info, "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", eb->start, btrfs_header_owner(eb), current->pid); return true; } return false; } #else static bool check_eb_lock_owner(struct extent_buffer *eb) { return false; } #endif static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; u64 lockdep_owner = owner; buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level); if (IS_ERR(buf)) return buf; if (check_eb_lock_owner(buf)) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } /* * The reloc trees are just snapshots, so we need them to appear to be * just like any other fs tree WRT lockdep. * * The exception however is in replace_path() in relocation, where we * hold the lock on the original fs root and then search for the reloc * root. At that point we need to make sure any reloc root buffers are * set to the BTRFS_TREE_RELOC_OBJECTID lockdep class in order to make * lockdep happy. */ if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID && !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; /* btrfs_clear_buffer_dirty() accesses generation field. */ btrfs_set_header_generation(buf, trans->transid); /* * This needs to stay, because we could allocate a freed block from an * old tree into a new tree, so we need to make sure this new block is * set to the appropriate level and owner. */ btrfs_set_buffer_lockdep_class(lockdep_owner, buf, level); btrfs_tree_lock_nested(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); set_extent_buffer_uptodate(buf); memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(buf, level); btrfs_set_header_bytenr(buf, buf->start); btrfs_set_header_generation(buf, trans->transid); btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(buf, owner); write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid); write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid); if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, EXTENT_DIRTY, NULL); else set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, EXTENT_NEW, NULL); } else { buf->log_index = -1; set_extent_bit(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; } /* * finds a free extent and does all the dirty work required for allocation * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, u64 reloc_src_root, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; u64 flags = 0; int ret; u32 blocksize = fs_info->nodesize; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); u64 owning_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } #endif block_rsv = btrfs_use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, empty_size, hint, &ins, 0, 0); if (ret) goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; } owning_root = btrfs_header_owner(buf); if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; owning_root = reloc_src_root; } else BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; struct btrfs_ref generic_ref = { .action = BTRFS_ADD_DELAYED_EXTENT, .bytenr = ins.objectid, .num_bytes = ins.offset, .parent = parent, .owning_root = owning_root, .ref_root = root_objectid, }; if (!skinny_metadata || flags != 0) { extent_op = btrfs_alloc_delayed_extent_op(); if (!extent_op) { ret = -ENOMEM; goto out_free_buf; } if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; extent_op->update_key = (skinny_metadata ? false : true); extent_op->update_flags = (flags != 0); } else { extent_op = NULL; } btrfs_init_tree_ref(&generic_ref, level, btrfs_root_id(root), false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) { btrfs_free_delayed_extent_op(extent_op); goto out_free_buf; } } return buf; out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); } struct walk_control { u64 refs[BTRFS_MAX_LEVEL]; u64 flags[BTRFS_MAX_LEVEL]; struct btrfs_key update_progress; struct btrfs_key drop_progress; int drop_level; int stage; int level; int shared_level; int update_ref; int keep_locks; int reada_slot; int reada_count; int restarted; /* Indicate that extent info needs to be looked up when walking the tree. */ int lookup_info; }; /* * This is our normal stage. We are traversing blocks the current snapshot owns * and we are dropping any of our references to any children we are able to, and * then freeing the block once we've processed all of the children. */ #define DROP_REFERENCE 1 /* * We enter this stage when we have to walk into a child block (meaning we can't * simply drop our reference to it from our current parent node) and there are * more than one reference on it. If we are the owner of any of the children * blocks from the current parent node then we have to do the FULL_BACKREF dance * on them in order to drop our normal ref and add the shared ref. */ #define UPDATE_BACKREF 2 /* * Decide if we need to walk down into this node to adjust the references. * * @root: the root we are currently deleting * @wc: the walk control for this deletion * @eb: the parent eb that we're currently visiting * @refs: the number of refs for wc->level - 1 * @flags: the flags for wc->level - 1 * @slot: the slot in the eb that we're currently checking * * This is meant to be called when we're evaluating if a node we point to at * wc->level should be read and walked into, or if we can simply delete our * reference to it. We return true if we should walk into the node, false if we * can skip it. * * We have assertions in here to make sure this is called correctly. We assume * that sanity checking on the blocks read to this point has been done, so any * corrupted file systems must have been caught before calling this function. */ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc, struct extent_buffer *eb, u64 flags, int slot) { struct btrfs_key key; u64 generation; int level = wc->level; ASSERT(level > 0); ASSERT(wc->refs[level - 1] > 0); /* * The update backref stage we only want to skip if we already have * FULL_BACKREF set, otherwise we need to read. */ if (wc->stage == UPDATE_BACKREF) { if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) return false; return true; } /* * We're the last ref on this block, we must walk into it and process * any refs it's pointing at. */ if (wc->refs[level - 1] == 1) return true; /* * If we're already FULL_BACKREF then we know we can just drop our * current reference. */ if (level == 1 && flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) return false; /* * This block is older than our creation generation, we can drop our * reference to it. */ generation = btrfs_node_ptr_generation(eb, slot); if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) return false; /* * This block was processed from a previous snapshot deletion run, we * can skip it. */ btrfs_node_key_to_cpu(eb, &key, slot); if (btrfs_comp_cpu_keys(&key, &wc->update_progress) < 0) return false; /* All other cases we need to wander into the node. */ return true; } static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct walk_control *wc, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; u64 refs; u64 flags; u32 nritems; struct extent_buffer *eb; int ret; int slot; int nread = 0; if (path->slots[wc->level] < wc->reada_slot) { wc->reada_count = wc->reada_count * 2 / 3; wc->reada_count = max(wc->reada_count, 2); } else { wc->reada_count = wc->reada_count * 3 / 2; wc->reada_count = min_t(int, wc->reada_count, BTRFS_NODEPTRS_PER_BLOCK(fs_info)); } eb = path->nodes[wc->level]; nritems = btrfs_header_nritems(eb); for (slot = path->slots[wc->level]; slot < nritems; slot++) { if (nread >= wc->reada_count) break; cond_resched(); bytenr = btrfs_node_blockptr(eb, slot); generation = btrfs_node_ptr_generation(eb, slot); if (slot == path->slots[wc->level]) goto reada; if (wc->stage == UPDATE_BACKREF && generation <= btrfs_root_origin_generation(root)) continue; /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, &flags, NULL); /* We don't care about errors in readahead. */ if (ret < 0) continue; /* * This could be racey, it's conceivable that we raced and end * up with a bogus refs count, if that's the case just skip, if * we are actually corrupt we will notice when we look up * everything again with our locks. */ if (refs == 0) continue; /* If we don't need to visit this node don't reada. */ if (!visit_node_for_delete(root, wc, eb, flags, slot)) continue; reada: btrfs_readahead_node_child(eb, slot); nread++; } wc->reada_slot = slot; } /* * helper to process tree block while walking down the tree. * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. * * NOTE: return value 1 means we should stop walking down. */ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; if (wc->stage == UPDATE_BACKREF && btrfs_header_owner(eb) != btrfs_root_id(root)) return 1; /* * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ if (wc->lookup_info && ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { ASSERT(path->locks[level]); ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); if (ret) return ret; if (unlikely(wc->refs[level] == 0)) { btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", eb->start); return -EUCLEAN; } } if (wc->stage == DROP_REFERENCE) { if (wc->refs[level] > 1) return 1; if (path->locks[level] && !wc->keep_locks) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; } /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { ASSERT(path->locks[level]); ret = btrfs_inc_ref(trans, root, eb, 1); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_dec_ref(trans, root, eb, 0); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = btrfs_set_disk_extent_flags(trans, eb, flag); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } wc->flags[level] |= flag; } /* * the block is shared by multiple trees, so it's not good to * keep the tree lock */ if (path->locks[level] && level > 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; } return 0; } /* * This is used to verify a ref exists for this root to deal with a bug where we * would have a drop_progress key that hadn't been updated properly. */ static int check_ref_exists(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 parent, int level) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: ret = lookup_extent_backref(trans, path, &iref, bytenr, root->fs_info->nodesize, parent, btrfs_root_id(root), level, 0); if (ret != -ENOENT) { /* * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ return (ret < 0 ) ? ret : 1; } /* * We could have a delayed ref with this reference, so look it up while * we're holding the path open to make sure we don't race with the * delayed ref running. */ delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr); if (!head) goto out; if (!mutex_trylock(&head->mutex)) { /* * We're contended, means that the delayed ref is running, get a * reference and wait for the ref head to be complete and then * try again. */ refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); btrfs_put_delayed_ref_head(head); goto again; } exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); return exists ? 1 : 0; } /* * We may not have an uptodate block, so if we are going to walk down into this * block we need to drop the lock, read it off of the disk, re-lock it and * return to continue dropping the snapshot. */ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, struct extent_buffer *next) { struct btrfs_tree_parent_check check = { 0 }; u64 generation; int level = wc->level; int ret; btrfs_assert_tree_write_locked(next); generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); if (btrfs_buffer_uptodate(next, generation, 0)) return 0; check.level = level - 1; check.transid = generation; check.owner_root = btrfs_root_id(root); check.has_first_key = true; btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); btrfs_tree_unlock(next); if (level == 1) reada_walk_down(trans, root, wc, path); ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; } btrfs_tree_lock(next); wc->lookup_info = 1; return 0; } /* * If we determine that we don't have to visit wc->level - 1 then we need to * determine if we can drop our reference. * * If we are UPDATE_BACKREF then we will not, we need to update our backrefs. * * If we are DROP_REFERENCE this will figure out if we need to drop our current * reference, skipping it if we dropped it from a previous incompleted drop, or * dropping it if we still have a reference to it. */ static int maybe_drop_reference(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, struct extent_buffer *next, u64 owner_root) { struct btrfs_ref ref = { .action = BTRFS_DROP_DELAYED_REF, .bytenr = next->start, .num_bytes = root->fs_info->nodesize, .owning_root = owner_root, .ref_root = btrfs_root_id(root), }; int level = wc->level; int ret; /* We are UPDATE_BACKREF, we're not dropping anything. */ if (wc->stage == UPDATE_BACKREF) return 0; if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { ref.parent = path->nodes[level]->start; } else { ASSERT(btrfs_root_id(root) == btrfs_header_owner(path->nodes[level])); if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level])) { btrfs_err(root->fs_info, "mismatched block owner"); return -EIO; } } /* * If we had a drop_progress we need to verify the refs are set as * expected. If we find our ref then we know that from here on out * everything should be correct, and we can clear the * ->restarted flag. */ if (wc->restarted) { ret = check_ref_exists(trans, root, next->start, ref.parent, level - 1); if (ret <= 0) return ret; ret = 0; wc->restarted = 0; } /* * Reloc tree doesn't contribute to qgroup numbers, and we have already * accounted them at merge time (replace_path), thus we could skip * expensive subtree trace here. */ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID && wc->refs[level - 1] > 1) { u64 generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); ret = btrfs_qgroup_trace_subtree(trans, next, generation, level - 1); if (ret) { btrfs_err_rl(root->fs_info, "error %d accounting shared subtree, quota is out of sync, rescan required", ret); } } /* * We need to update the next key in our walk control so we can update * the drop_progress key accordingly. We don't care if find_next_key * doesn't find a key because that means we're at the end and are going * to clean up now. */ wc->drop_level = level; find_next_key(path, level, &wc->drop_progress); btrfs_init_tree_ref(&ref, level - 1, 0, false); return btrfs_free_extent(trans, &ref); } /* * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks * reference count of the block pointed to. if the block * is shared and we need update back refs for the subtree * rooted at the block, this function changes wc->stage to * UPDATE_BACKREF. if the block is shared and there is no * need to update back, this function drops the reference * to the block. * * NOTE: return value 1 means we should stop walking down. */ static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; u64 owner_root = 0; struct extent_buffer *next; int level = wc->level; int ret = 0; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); /* * if the lower level block was created before the snapshot * was created, we know there is no need to update back refs * for the subtree */ if (wc->stage == UPDATE_BACKREF && generation <= btrfs_root_origin_generation(root)) { wc->lookup_info = 1; return 1; } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_root_id(root), level - 1); if (IS_ERR(next)) return PTR_ERR(next); btrfs_tree_lock(next); ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1], &owner_root); if (ret < 0) goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", bytenr); ret = -EUCLEAN; goto out_unlock; } wc->lookup_info = 0; /* If we don't have to walk into this node skip it. */ if (!visit_node_for_delete(root, wc, path->nodes[level], wc->flags[level - 1], path->slots[level])) goto skip; /* * We have to walk down into this node, and if we're currently at the * DROP_REFERNCE stage and this block is shared then we need to switch * to the UPDATE_BACKREF stage in order to convert to FULL_BACKREF. */ if (wc->stage == DROP_REFERENCE && wc->refs[level - 1] > 1) { wc->stage = UPDATE_BACKREF; wc->shared_level = level - 1; } ret = check_next_block_uptodate(trans, root, path, wc, next); if (ret) return ret; level--; ASSERT(level == btrfs_header_level(next)); if (level != btrfs_header_level(next)) { btrfs_err(root->fs_info, "mismatched level"); ret = -EIO; goto out_unlock; } path->nodes[level] = next; path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK; wc->level = level; if (wc->level == 1) wc->reada_slot = 0; return 0; skip: ret = maybe_drop_reference(trans, root, path, wc, next, owner_root); if (ret) goto out_unlock; wc->refs[level - 1] = 0; wc->flags[level - 1] = 0; wc->lookup_info = 1; ret = 1; out_unlock: btrfs_tree_unlock(next); free_extent_buffer(next); return ret; } /* * helper to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops * reference count on the block. * * when wc->stage == UPDATE_BACKREF, this function changes * wc->stage back to DROP_REFERENCE if we changed wc->stage * to UPDATE_BACKREF previously while processing the block. * * NOTE: return value 1 means we should stop walking up. */ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; if (wc->stage == UPDATE_BACKREF) { ASSERT(wc->shared_level >= level); if (level < wc->shared_level) goto out; ret = find_next_key(path, level + 1, &wc->update_progress); if (ret > 0) wc->update_ref = 0; wc->stage = DROP_REFERENCE; wc->shared_level = -1; path->slots[level] = 0; /* * check reference count again if the block isn't locked. * we should start walking down the tree again if reference * count is one. */ if (!path->locks[level]) { ASSERT(level > 0); btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; return ret; } if (unlikely(wc->refs[level] == 0)) { btrfs_tree_unlock_rw(eb, path->locks[level]); btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0", eb->start); return -EUCLEAN; } if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; return 1; } } } /* wc->stage == DROP_REFERENCE */ ASSERT(path->locks[level] || wc->refs[level] == 1); if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) ret = btrfs_dec_ref(trans, root, eb, 1); else ret = btrfs_dec_ref(trans, root, eb, 0); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } if (is_fstree(btrfs_root_id(root))) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { btrfs_err_rl(fs_info, "error %d accounting leaf items, quota is out of sync, rescan required", ret); } } } /* Make block locked assertion in btrfs_clear_buffer_dirty happy. */ if (!path->locks[level]) { btrfs_tree_lock(eb); path->locks[level] = BTRFS_WRITE_LOCK; } btrfs_clear_buffer_dirty(trans, eb); } if (eb == root->node) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = eb->start; else if (btrfs_root_id(root) != btrfs_header_owner(eb)) goto owner_mismatch; } else { if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) parent = path->nodes[level + 1]->start; else if (btrfs_root_id(root) != btrfs_header_owner(path->nodes[level + 1])) goto owner_mismatch; } ret = btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent, wc->refs[level] == 1); if (ret < 0) btrfs_abort_transaction(trans, ret); out: wc->refs[level] = 0; wc->flags[level] = 0; return ret; owner_mismatch: btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", btrfs_header_owner(eb), btrfs_root_id(root)); return -EUCLEAN; } /* * walk_down_tree consists of two steps. * * walk_down_proc(). Look up the reference count and reference of our current * wc->level. At this point path->nodes[wc->level] should be populated and * uptodate, and in most cases should already be locked. If we are in * DROP_REFERENCE and our refcount is > 1 then we've entered a shared node and * we can walk back up the tree. If we are UPDATE_BACKREF we have to set * FULL_BACKREF on this node if it's not already set, and then do the * FULL_BACKREF conversion dance, which is to drop the root reference and add * the shared reference to all of this nodes children. * * do_walk_down(). This is where we actually start iterating on the children of * our current path->nodes[wc->level]. For DROP_REFERENCE that means dropping * our reference to the children that return false from visit_node_for_delete(), * which has various conditions where we know we can just drop our reference * without visiting the node. For UPDATE_BACKREF we will skip any children that * visit_node_for_delete() returns false for, only walking down when necessary. * The bulk of the work for UPDATE_BACKREF occurs in the walk_up_tree() part of * snapshot deletion. */ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc) { int level = wc->level; int ret = 0; wc->lookup_info = 1; while (level >= 0) { ret = walk_down_proc(trans, root, path, wc); if (ret) break; if (level == 0) break; if (path->slots[level] >= btrfs_header_nritems(path->nodes[level])) break; ret = do_walk_down(trans, root, path, wc); if (ret > 0) { path->slots[level]++; continue; } else if (ret < 0) break; level = wc->level; } return (ret == 1) ? 0 : ret; } /* * walk_up_tree() is responsible for making sure we visit every slot on our * current node, and if we're at the end of that node then we call * walk_up_proc() on our current node which will do one of a few things based on * our stage. * * UPDATE_BACKREF. If we wc->level is currently less than our wc->shared_level * then we need to walk back up the tree, and then going back down into the * other slots via walk_down_tree to update any other children from our original * wc->shared_level. Once we're at or above our wc->shared_level we can switch * back to DROP_REFERENCE, lookup the current nodes refs and flags, and carry on. * * DROP_REFERENCE. If our refs == 1 then we're going to free this tree block. * If we're level 0 then we need to btrfs_dec_ref() on all of the data extents * in our current leaf. After that we call btrfs_free_tree_block() on the * current node and walk up to the next node to walk down the next slot. */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct walk_control *wc, int max_level) { int level = wc->level; int ret; path->slots[level] = btrfs_header_nritems(path->nodes[level]); while (level < max_level && path->nodes[level]) { wc->level = level; if (path->slots[level] + 1 < btrfs_header_nritems(path->nodes[level])) { path->slots[level]++; return 0; } else { ret = walk_up_proc(trans, root, path, wc); if (ret > 0) return 0; if (ret < 0) return ret; if (path->locks[level]) { btrfs_tree_unlock_rw(path->nodes[level], path->locks[level]); path->locks[level] = 0; } free_extent_buffer(path->nodes[level]); path->nodes[level] = NULL; level++; } } return 1; } /* * drop a subvolume tree. * * this function traverses the tree freeing any blocks that only * referenced by the tree. * * when a shared tree block is found. this function decreases its * reference count by one. if update_ref is true, this function * also make sure backrefs for the shared block and all lower level * blocks are properly updated. * * If called with for_reloc == 0, may exit early with -EAGAIN */ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) { const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root_item *root_item = &root->root_item; struct walk_control *wc; struct btrfs_key key; const u64 rootid = btrfs_root_id(root); int ret = 0; int level; bool root_dropped = false; bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", btrfs_root_id(root)); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { btrfs_free_path(path); ret = -ENOMEM; goto out; } /* * Use join to avoid potential EINTR from transaction start. See * wait_reserve_ticket and the whole reservation callchain. */ if (for_reloc) trans = btrfs_join_transaction(tree_root); else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_free; } ret = btrfs_run_delayed_items(trans); if (ret) goto out_end_trans; /* * This will help us catch people modifying the fs tree while we're * dropping it. It is unsafe to mess with the fs tree while it's being * dropped as we unlock the root node and parent nodes as we walk down * the tree, assuming nothing will change. If something does change * then we'll have stale information and drop references to blocks we've * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK; memset(&wc->update_progress, 0, sizeof(wc->update_progress)); } else { btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); memcpy(&wc->update_progress, &key, sizeof(wc->update_progress)); level = btrfs_root_drop_level(root_item); BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->lowest_level = 0; if (ret < 0) goto out_end_trans; WARN_ON(ret > 0); ret = 0; /* * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ btrfs_unlock_up_safe(path, 0); level = btrfs_header_level(root->node); while (1) { btrfs_tree_lock(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK; /* * btrfs_lookup_extent_info() returns 0 for success, * or < 0 for error. */ ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], &wc->flags[level], NULL); if (ret < 0) goto out_end_trans; BUG_ON(wc->refs[level] == 0); if (level == btrfs_root_drop_level(root_item)) break; btrfs_tree_unlock(path->nodes[level]); path->locks[level] = 0; WARN_ON(wc->refs[level] != 1); level--; } } wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); wc->level = level; wc->shared_level = -1; wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { ret = walk_down_tree(trans, root, path, wc); if (ret < 0) { btrfs_abort_transaction(trans, ret); break; } ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); if (ret < 0) { btrfs_abort_transaction(trans, ret); break; } if (ret > 0) { BUG_ON(wc->stage != DROP_REFERENCE); ret = 0; break; } if (wc->stage == DROP_REFERENCE) { wc->drop_level = wc->level; btrfs_node_key_to_cpu(path->nodes[wc->drop_level], &wc->drop_progress, path->slots[wc->drop_level]); } btrfs_cpu_key_to_disk(&root_item->drop_progress, &wc->drop_progress); btrfs_set_root_drop_level(root_item, wc->drop_level); BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } if (!is_reloc_root) btrfs_set_last_root_drop_gen(fs_info, trans->transid); btrfs_end_transaction_throttle(trans); if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, "drop snapshot early exit"); ret = -EAGAIN; goto out_free; } /* * Use join to avoid potential EINTR from transaction * start. See wait_reserve_ticket and the whole * reservation callchain. */ if (for_reloc) trans = btrfs_join_transaction(tree_root); else trans = btrfs_start_transaction(tree_root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_free; } } } btrfs_release_path(path); if (ret) goto out_end_trans; ret = btrfs_del_root(trans, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } if (!is_reloc_root) { ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } else if (ret > 0) { ret = 0; /* * If we fail to delete the orphan item this time * around, it'll get picked up the next time. * * The most common failure here is just -ENOENT. */ btrfs_del_orphan_item(trans, tree_root, btrfs_root_id(root)); } } /* * This subvolume is going to be completely dropped, and won't be * recorded as dirty roots, thus pertrans meta rsv will not be freed at * commit transaction time. So free it here manually. */ btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); root_dropped = true; out_end_trans: if (!is_reloc_root) btrfs_set_last_root_drop_gen(fs_info, trans->transid); btrfs_end_transaction_throttle(trans); out_free: kfree(wc); btrfs_free_path(path); out: if (!ret && root_dropped) { ret = btrfs_qgroup_cleanup_dropped_subvolume(fs_info, rootid); if (ret < 0) btrfs_warn_rl(fs_info, "failed to cleanup qgroup 0/%llu: %d", rootid, ret); ret = 0; } /* * We were an unfinished drop root, check to see if there are any * pending, and if not clear and wake up any waiters. */ if (!ret && unfinished_drop) btrfs_maybe_wake_unfinished_drop(fs_info); /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we * keep trying to do the work later. This also cleans up roots if we * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. */ if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); return ret; } /* * drop subtree rooted at tree block 'node'. * * NOTE: this function will unlock and release tree block 'node' * only used by relocation code */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; int ret = 0; BUG_ON(btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); if (!path) return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) return -ENOMEM; btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); btrfs_assert_tree_write_locked(node); level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; path->locks[level] = BTRFS_WRITE_LOCK; wc->refs[parent_level] = 1; wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; wc->level = level; wc->shared_level = -1; wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { ret = walk_down_tree(trans, root, path, wc); if (ret < 0) break; ret = walk_up_tree(trans, root, path, wc, parent_level); if (ret) { if (ret > 0) ret = 0; break; } } kfree(wc); return ret; } /* * Unpin the extent range in an error context and don't add the space back. * Errors are not propagated further. */ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { unpin_extent_range(fs_info, start, end, false); } /* * It used to be that old block groups would be left around forever. * Iterating over them would be enough to trim unused space. Since we * now automatically remove them, we also need to iterate over unallocated * space. * * We don't want a transaction for this since the discard may take a * substantial amount of time. We don't require that a transaction be * running, but we do need to take a running transaction into account * to ensure that we're not discarding chunks that were released or * allocated in the current transaction. * * Holding the chunks lock will prevent other threads from allocating * or releasing chunks, but it won't prevent a running transaction * from committing and releasing the memory that the pending chunks * list head uses. For that, we need to take a reference to the * transaction and hold the commit root sem. We only need to hold * it while performing the free space search since we have already * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; /* Discard not supported = nothing to do. */ if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return 0; /* No free space = nothing to do. */ if (device->total_bytes <= device->bytes_used) return 0; ret = 0; while (1) { struct btrfs_fs_info *fs_info = device->fs_info; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, btrfs_dev_name(device), device->total_bytes); mutex_unlock(&fs_info->chunk_mutex); ret = 0; break; } /* Ensure we skip the reserved space on each device. */ start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the * end of the device it will set end to -1, in this case it's up * to the caller to trim the value to the size of the device. */ end = min(end, device->total_bytes - 1); len = end - start + 1; /* We didn't find any extents */ if (!len) { mutex_unlock(&fs_info->chunk_mutex); ret = 0; break; } ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) set_extent_bit(&device->alloc_state, start, start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) break; start += len; *trimmed += bytes; if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; break; } cond_resched(); } return ret; } /* * Trim the whole filesystem by: * 1) trimming the free space in each block group * 2) trimming the unallocated space on each device * * This will also continue trimming even if a block group or device encounters * an error. The return value will be the last error, or 0 if nothing bad * happens. */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; struct btrfs_device *device; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; u64 end; u64 trimmed = 0; u64 bg_failed = 0; u64 dev_failed = 0; int bg_ret = 0; int dev_ret = 0; int ret = 0; if (range->start == U64_MAX) return -EINVAL; /* * Check range overflow if range->len is set. * The default range->len is U64_MAX. */ if (range->len != U64_MAX && check_add_overflow(range->start, range->len, &range_end)) return -EINVAL; cache = btrfs_lookup_first_block_group(fs_info, range->start); for (; cache; cache = btrfs_next_block_group(cache)) { if (cache->start >= range_end) { btrfs_put_block_group(cache); break; } start = max(range->start, cache->start); end = min(range_end, cache->start + cache->length); if (end - start >= range->minlen) { if (!btrfs_block_group_done(cache)) { ret = btrfs_cache_block_group(cache, true); if (ret) { bg_failed++; bg_ret = ret; continue; } } ret = btrfs_trim_block_group(cache, &group_trimmed, start, end, range->minlen); trimmed += group_trimmed; if (ret) { bg_failed++; bg_ret = ret; continue; } } } if (bg_failed) btrfs_warn(fs_info, "failed to trim %llu block group(s), last error %d", bg_failed, bg_ret); mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) continue; ret = btrfs_trim_free_extents(device, &group_trimmed); trimmed += group_trimmed; if (ret) { dev_failed++; dev_ret = ret; break; } } mutex_unlock(&fs_devices->device_list_mutex); if (dev_failed) btrfs_warn(fs_info, "failed to trim %llu device(s), last error %d", dev_failed, dev_ret); range->len = trimmed; if (bg_ret) return bg_ret; return dev_ret; } |
2 2 2 3 3 3 1 3 3 3 1 1 1 2 2 2 2 2 1 1 1 1 14 14 14 4 3 3 13 13 14 10 12 14 14 14 12 9 8 8 12 10 12 12 14 14 14 14 14 14 14 14 14 14 15 14 14 14 14 14 1 14 14 15 9 9 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 | /* * Copyright (c) 2010-2011 Atheros Communications Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <linux/unaligned.h> #include "htc.h" MODULE_FIRMWARE(HTC_7010_MODULE_FW); MODULE_FIRMWARE(HTC_9271_MODULE_FW); static const struct usb_device_id ath9k_hif_usb_ids[] = { { USB_DEVICE(0x0cf3, 0x9271) }, /* Atheros */ { USB_DEVICE(0x0cf3, 0x1006) }, /* Atheros */ { USB_DEVICE(0x0846, 0x9030) }, /* Netgear N150 */ { USB_DEVICE(0x07b8, 0x9271) }, /* Altai WA1011N-GU */ { USB_DEVICE(0x07D1, 0x3A10) }, /* Dlink Wireless 150 */ { USB_DEVICE(0x13D3, 0x3327) }, /* Azurewave */ { USB_DEVICE(0x13D3, 0x3328) }, /* Azurewave */ { USB_DEVICE(0x13D3, 0x3346) }, /* IMC Networks */ { USB_DEVICE(0x13D3, 0x3348) }, /* Azurewave */ { USB_DEVICE(0x13D3, 0x3349) }, /* Azurewave */ { USB_DEVICE(0x13D3, 0x3350) }, /* Azurewave */ { USB_DEVICE(0x04CA, 0x4605) }, /* Liteon */ { USB_DEVICE(0x040D, 0x3801) }, /* VIA */ { USB_DEVICE(0x0cf3, 0xb003) }, /* Ubiquiti WifiStation Ext */ { USB_DEVICE(0x0cf3, 0xb002) }, /* Ubiquiti WifiStation */ { USB_DEVICE(0x057c, 0x8403) }, /* AVM FRITZ!WLAN 11N v2 USB */ { USB_DEVICE(0x0471, 0x209e) }, /* Philips (or NXP) PTA01 */ { USB_DEVICE(0x1eda, 0x2315) }, /* AirTies */ { USB_DEVICE(0x0cf3, 0x7015), .driver_info = AR9287_USB }, /* Atheros */ { USB_DEVICE(0x0cf3, 0x7010), .driver_info = AR9280_USB }, /* Atheros */ { USB_DEVICE(0x0846, 0x9018), .driver_info = AR9280_USB }, /* Netgear WNDA3200 */ { USB_DEVICE(0x083A, 0xA704), .driver_info = AR9280_USB }, /* SMC Networks */ { USB_DEVICE(0x0411, 0x017f), .driver_info = AR9280_USB }, /* Sony UWA-BR100 */ { USB_DEVICE(0x0411, 0x0197), .driver_info = AR9280_USB }, /* Buffalo WLI-UV-AG300P */ { USB_DEVICE(0x04da, 0x3904), .driver_info = AR9280_USB }, { USB_DEVICE(0x0930, 0x0a08), .driver_info = AR9280_USB }, /* Toshiba WLM-20U2 and GN-1080 */ { USB_DEVICE(0x0cf3, 0x20ff), .driver_info = STORAGE_DEVICE }, { }, }; MODULE_DEVICE_TABLE(usb, ath9k_hif_usb_ids); static int __hif_usb_tx(struct hif_device_usb *hif_dev); static void hif_usb_regout_cb(struct urb *urb) { struct cmd_buf *cmd = urb->context; switch (urb->status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ENODEV: case -ESHUTDOWN: goto free; default: break; } if (cmd) { ath9k_htc_txcompletion_cb(cmd->hif_dev->htc_handle, cmd->skb, true); kfree(cmd); } return; free: kfree_skb(cmd->skb); kfree(cmd); } static int hif_usb_send_regout(struct hif_device_usb *hif_dev, struct sk_buff *skb) { struct urb *urb; struct cmd_buf *cmd; int ret = 0; urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) return -ENOMEM; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (cmd == NULL) { usb_free_urb(urb); return -ENOMEM; } cmd->skb = skb; cmd->hif_dev = hif_dev; usb_fill_int_urb(urb, hif_dev->udev, usb_sndintpipe(hif_dev->udev, USB_REG_OUT_PIPE), skb->data, skb->len, hif_usb_regout_cb, cmd, 1); usb_anchor_urb(urb, &hif_dev->regout_submitted); ret = usb_submit_urb(urb, GFP_KERNEL); if (ret) { usb_unanchor_urb(urb); kfree(cmd); } usb_free_urb(urb); return ret; } static void hif_usb_mgmt_cb(struct urb *urb) { struct cmd_buf *cmd = urb->context; struct hif_device_usb *hif_dev; unsigned long flags; bool txok = true; if (!cmd || !cmd->skb || !cmd->hif_dev) return; hif_dev = cmd->hif_dev; switch (urb->status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ENODEV: case -ESHUTDOWN: txok = false; /* * If the URBs are being flushed, no need to complete * this packet. */ spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); if (hif_dev->tx.flags & HIF_USB_TX_FLUSH) { spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); dev_kfree_skb_any(cmd->skb); kfree(cmd); return; } spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); break; default: txok = false; break; } skb_pull(cmd->skb, 4); ath9k_htc_txcompletion_cb(cmd->hif_dev->htc_handle, cmd->skb, txok); kfree(cmd); } static int hif_usb_send_mgmt(struct hif_device_usb *hif_dev, struct sk_buff *skb) { struct urb *urb; struct cmd_buf *cmd; int ret = 0; __le16 *hdr; urb = usb_alloc_urb(0, GFP_ATOMIC); if (urb == NULL) return -ENOMEM; cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC); if (cmd == NULL) { usb_free_urb(urb); return -ENOMEM; } cmd->skb = skb; cmd->hif_dev = hif_dev; hdr = skb_push(skb, 4); *hdr++ = cpu_to_le16(skb->len - 4); *hdr++ = cpu_to_le16(ATH_USB_TX_STREAM_MODE_TAG); usb_fill_bulk_urb(urb, hif_dev->udev, usb_sndbulkpipe(hif_dev->udev, USB_WLAN_TX_PIPE), skb->data, skb->len, hif_usb_mgmt_cb, cmd); usb_anchor_urb(urb, &hif_dev->mgmt_submitted); ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret) { usb_unanchor_urb(urb); kfree(cmd); } usb_free_urb(urb); return ret; } static inline void ath9k_skb_queue_purge(struct hif_device_usb *hif_dev, struct sk_buff_head *list) { struct sk_buff *skb; while ((skb = __skb_dequeue(list)) != NULL) { dev_kfree_skb_any(skb); } } static inline void ath9k_skb_queue_complete(struct hif_device_usb *hif_dev, struct sk_buff_head *queue, bool txok) { struct sk_buff *skb; while ((skb = __skb_dequeue(queue)) != NULL) { #ifdef CONFIG_ATH9K_HTC_DEBUGFS int ln = skb->len; #endif ath9k_htc_txcompletion_cb(hif_dev->htc_handle, skb, txok); if (txok) { TX_STAT_INC(hif_dev, skb_success); TX_STAT_ADD(hif_dev, skb_success_bytes, ln); } else TX_STAT_INC(hif_dev, skb_failed); } } static void hif_usb_tx_cb(struct urb *urb) { struct tx_buf *tx_buf = urb->context; struct hif_device_usb *hif_dev; bool txok = true; if (!tx_buf || !tx_buf->hif_dev) return; hif_dev = tx_buf->hif_dev; switch (urb->status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ENODEV: case -ESHUTDOWN: txok = false; /* * If the URBs are being flushed, no need to add this * URB to the free list. */ spin_lock(&hif_dev->tx.tx_lock); if (hif_dev->tx.flags & HIF_USB_TX_FLUSH) { spin_unlock(&hif_dev->tx.tx_lock); ath9k_skb_queue_purge(hif_dev, &tx_buf->skb_queue); return; } spin_unlock(&hif_dev->tx.tx_lock); break; default: txok = false; break; } ath9k_skb_queue_complete(hif_dev, &tx_buf->skb_queue, txok); /* Re-initialize the SKB queue */ tx_buf->len = tx_buf->offset = 0; __skb_queue_head_init(&tx_buf->skb_queue); /* Add this TX buffer to the free list */ spin_lock(&hif_dev->tx.tx_lock); list_move_tail(&tx_buf->list, &hif_dev->tx.tx_buf); hif_dev->tx.tx_buf_cnt++; if (!(hif_dev->tx.flags & HIF_USB_TX_STOP)) __hif_usb_tx(hif_dev); /* Check for pending SKBs */ TX_STAT_INC(hif_dev, buf_completed); spin_unlock(&hif_dev->tx.tx_lock); } /* TX lock has to be taken */ static int __hif_usb_tx(struct hif_device_usb *hif_dev) { struct tx_buf *tx_buf = NULL; struct sk_buff *nskb = NULL; int ret = 0, i; u16 tx_skb_cnt = 0; u8 *buf; __le16 *hdr; if (hif_dev->tx.tx_skb_cnt == 0) return 0; /* Check if a free TX buffer is available */ if (list_empty(&hif_dev->tx.tx_buf)) return 0; tx_buf = list_first_entry(&hif_dev->tx.tx_buf, struct tx_buf, list); list_move_tail(&tx_buf->list, &hif_dev->tx.tx_pending); hif_dev->tx.tx_buf_cnt--; tx_skb_cnt = min_t(u16, hif_dev->tx.tx_skb_cnt, MAX_TX_AGGR_NUM); for (i = 0; i < tx_skb_cnt; i++) { nskb = __skb_dequeue(&hif_dev->tx.tx_skb_queue); /* Should never be NULL */ BUG_ON(!nskb); hif_dev->tx.tx_skb_cnt--; buf = tx_buf->buf; buf += tx_buf->offset; hdr = (__le16 *)buf; *hdr++ = cpu_to_le16(nskb->len); *hdr++ = cpu_to_le16(ATH_USB_TX_STREAM_MODE_TAG); buf += 4; memcpy(buf, nskb->data, nskb->len); tx_buf->len = nskb->len + 4; if (i < (tx_skb_cnt - 1)) tx_buf->offset += (((tx_buf->len - 1) / 4) + 1) * 4; if (i == (tx_skb_cnt - 1)) tx_buf->len += tx_buf->offset; __skb_queue_tail(&tx_buf->skb_queue, nskb); TX_STAT_INC(hif_dev, skb_queued); } usb_fill_bulk_urb(tx_buf->urb, hif_dev->udev, usb_sndbulkpipe(hif_dev->udev, USB_WLAN_TX_PIPE), tx_buf->buf, tx_buf->len, hif_usb_tx_cb, tx_buf); ret = usb_submit_urb(tx_buf->urb, GFP_ATOMIC); if (ret) { tx_buf->len = tx_buf->offset = 0; ath9k_skb_queue_complete(hif_dev, &tx_buf->skb_queue, false); __skb_queue_head_init(&tx_buf->skb_queue); list_move_tail(&tx_buf->list, &hif_dev->tx.tx_buf); hif_dev->tx.tx_buf_cnt++; } else { TX_STAT_INC(hif_dev, buf_queued); } return ret; } static int hif_usb_send_tx(struct hif_device_usb *hif_dev, struct sk_buff *skb) { struct ath9k_htc_tx_ctl *tx_ctl; unsigned long flags; int ret = 0; spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); if (hif_dev->tx.flags & HIF_USB_TX_STOP) { spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); return -ENODEV; } /* Check if the max queue count has been reached */ if (hif_dev->tx.tx_skb_cnt > MAX_TX_BUF_NUM) { spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); return -ENOMEM; } spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); tx_ctl = HTC_SKB_CB(skb); /* Mgmt/Beacon frames don't use the TX buffer pool */ if ((tx_ctl->type == ATH9K_HTC_MGMT) || (tx_ctl->type == ATH9K_HTC_BEACON)) { ret = hif_usb_send_mgmt(hif_dev, skb); } spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); if ((tx_ctl->type == ATH9K_HTC_NORMAL) || (tx_ctl->type == ATH9K_HTC_AMPDU)) { __skb_queue_tail(&hif_dev->tx.tx_skb_queue, skb); hif_dev->tx.tx_skb_cnt++; } /* Check if AMPDUs have to be sent immediately */ if ((hif_dev->tx.tx_buf_cnt == MAX_TX_URB_NUM) && (hif_dev->tx.tx_skb_cnt < 2)) { __hif_usb_tx(hif_dev); } spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); return ret; } static void hif_usb_start(void *hif_handle) { struct hif_device_usb *hif_dev = hif_handle; unsigned long flags; hif_dev->flags |= HIF_USB_START; spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); hif_dev->tx.flags &= ~HIF_USB_TX_STOP; spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); } static void hif_usb_stop(void *hif_handle) { struct hif_device_usb *hif_dev = hif_handle; struct tx_buf *tx_buf = NULL, *tx_buf_tmp = NULL; unsigned long flags; spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); ath9k_skb_queue_complete(hif_dev, &hif_dev->tx.tx_skb_queue, false); hif_dev->tx.tx_skb_cnt = 0; hif_dev->tx.flags |= HIF_USB_TX_STOP; spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); /* The pending URBs have to be canceled. */ spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); list_for_each_entry_safe(tx_buf, tx_buf_tmp, &hif_dev->tx.tx_pending, list) { usb_get_urb(tx_buf->urb); spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); usb_kill_urb(tx_buf->urb); list_del(&tx_buf->list); usb_free_urb(tx_buf->urb); kfree(tx_buf->buf); kfree(tx_buf); spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); } spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); usb_kill_anchored_urbs(&hif_dev->mgmt_submitted); } static int hif_usb_send(void *hif_handle, u8 pipe_id, struct sk_buff *skb) { struct hif_device_usb *hif_dev = hif_handle; int ret = 0; switch (pipe_id) { case USB_WLAN_TX_PIPE: ret = hif_usb_send_tx(hif_dev, skb); break; case USB_REG_OUT_PIPE: ret = hif_usb_send_regout(hif_dev, skb); break; default: dev_err(&hif_dev->udev->dev, "ath9k_htc: Invalid TX pipe: %d\n", pipe_id); ret = -EINVAL; break; } return ret; } static inline bool check_index(struct sk_buff *skb, u8 idx) { struct ath9k_htc_tx_ctl *tx_ctl; tx_ctl = HTC_SKB_CB(skb); if ((tx_ctl->type == ATH9K_HTC_AMPDU) && (tx_ctl->sta_idx == idx)) return true; return false; } static void hif_usb_sta_drain(void *hif_handle, u8 idx) { struct hif_device_usb *hif_dev = hif_handle; struct sk_buff *skb, *tmp; unsigned long flags; spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); skb_queue_walk_safe(&hif_dev->tx.tx_skb_queue, skb, tmp) { if (check_index(skb, idx)) { __skb_unlink(skb, &hif_dev->tx.tx_skb_queue); ath9k_htc_txcompletion_cb(hif_dev->htc_handle, skb, false); hif_dev->tx.tx_skb_cnt--; TX_STAT_INC(hif_dev, skb_failed); } } spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); } static struct ath9k_htc_hif hif_usb = { .transport = ATH9K_HIF_USB, .name = "ath9k_hif_usb", .control_ul_pipe = USB_REG_OUT_PIPE, .control_dl_pipe = USB_REG_IN_PIPE, .start = hif_usb_start, .stop = hif_usb_stop, .sta_drain = hif_usb_sta_drain, .send = hif_usb_send, }; /* Need to free remain_skb allocated in ath9k_hif_usb_rx_stream * in case ath9k_hif_usb_rx_stream wasn't called next time to * process the buffer and subsequently free it. */ static void ath9k_hif_usb_free_rx_remain_skb(struct hif_device_usb *hif_dev) { unsigned long flags; spin_lock_irqsave(&hif_dev->rx_lock, flags); if (hif_dev->remain_skb) { dev_kfree_skb_any(hif_dev->remain_skb); hif_dev->remain_skb = NULL; hif_dev->rx_remain_len = 0; RX_STAT_INC(hif_dev, skb_dropped); } spin_unlock_irqrestore(&hif_dev->rx_lock, flags); } static void ath9k_hif_usb_rx_stream(struct hif_device_usb *hif_dev, struct sk_buff *skb) { struct sk_buff *nskb, *skb_pool[MAX_PKT_NUM_IN_TRANSFER]; int index = 0, i, len = skb->len; int rx_remain_len, rx_pkt_len; u16 pool_index = 0; u8 *ptr; spin_lock(&hif_dev->rx_lock); rx_remain_len = hif_dev->rx_remain_len; rx_pkt_len = hif_dev->rx_transfer_len; if (rx_remain_len != 0) { struct sk_buff *remain_skb = hif_dev->remain_skb; if (remain_skb) { ptr = (u8 *) remain_skb->data; index = rx_remain_len; rx_remain_len -= hif_dev->rx_pad_len; ptr += rx_pkt_len; memcpy(ptr, skb->data, rx_remain_len); rx_pkt_len += rx_remain_len; skb_put(remain_skb, rx_pkt_len); skb_pool[pool_index++] = remain_skb; hif_dev->remain_skb = NULL; hif_dev->rx_remain_len = 0; } else { index = rx_remain_len; } } spin_unlock(&hif_dev->rx_lock); while (index < len) { u16 pkt_len; u16 pkt_tag; u16 pad_len; int chk_idx; ptr = (u8 *) skb->data; pkt_len = get_unaligned_le16(ptr + index); pkt_tag = get_unaligned_le16(ptr + index + 2); /* It is supposed that if we have an invalid pkt_tag or * pkt_len then the whole input SKB is considered invalid * and dropped; the associated packets already in skb_pool * are dropped, too. */ if (pkt_tag != ATH_USB_RX_STREAM_MODE_TAG) { RX_STAT_INC(hif_dev, skb_dropped); goto invalid_pkt; } if (pkt_len > 2 * MAX_RX_BUF_SIZE) { dev_err(&hif_dev->udev->dev, "ath9k_htc: invalid pkt_len (%x)\n", pkt_len); RX_STAT_INC(hif_dev, skb_dropped); goto invalid_pkt; } pad_len = 4 - (pkt_len & 0x3); if (pad_len == 4) pad_len = 0; chk_idx = index; index = index + 4 + pkt_len + pad_len; if (index > MAX_RX_BUF_SIZE) { spin_lock(&hif_dev->rx_lock); nskb = __dev_alloc_skb(pkt_len + 32, GFP_ATOMIC); if (!nskb) { dev_err(&hif_dev->udev->dev, "ath9k_htc: RX memory allocation error\n"); spin_unlock(&hif_dev->rx_lock); goto err; } hif_dev->rx_remain_len = index - MAX_RX_BUF_SIZE; hif_dev->rx_transfer_len = MAX_RX_BUF_SIZE - chk_idx - 4; hif_dev->rx_pad_len = pad_len; skb_reserve(nskb, 32); RX_STAT_INC(hif_dev, skb_allocated); memcpy(nskb->data, &(skb->data[chk_idx+4]), hif_dev->rx_transfer_len); /* Record the buffer pointer */ hif_dev->remain_skb = nskb; spin_unlock(&hif_dev->rx_lock); } else { if (pool_index == MAX_PKT_NUM_IN_TRANSFER) { dev_err(&hif_dev->udev->dev, "ath9k_htc: over RX MAX_PKT_NUM\n"); goto err; } nskb = __dev_alloc_skb(pkt_len + 32, GFP_ATOMIC); if (!nskb) { dev_err(&hif_dev->udev->dev, "ath9k_htc: RX memory allocation error\n"); goto err; } skb_reserve(nskb, 32); RX_STAT_INC(hif_dev, skb_allocated); memcpy(nskb->data, &(skb->data[chk_idx+4]), pkt_len); skb_put(nskb, pkt_len); skb_pool[pool_index++] = nskb; } } err: for (i = 0; i < pool_index; i++) { RX_STAT_ADD(hif_dev, skb_completed_bytes, skb_pool[i]->len); ath9k_htc_rx_msg(hif_dev->htc_handle, skb_pool[i], skb_pool[i]->len, USB_WLAN_RX_PIPE); RX_STAT_INC(hif_dev, skb_completed); } return; invalid_pkt: for (i = 0; i < pool_index; i++) { dev_kfree_skb_any(skb_pool[i]); RX_STAT_INC(hif_dev, skb_dropped); } return; } static void ath9k_hif_usb_rx_cb(struct urb *urb) { struct rx_buf *rx_buf = urb->context; struct hif_device_usb *hif_dev = rx_buf->hif_dev; struct sk_buff *skb = rx_buf->skb; int ret; if (!skb) return; if (!hif_dev) goto free; switch (urb->status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ENODEV: case -ESHUTDOWN: goto free; default: goto resubmit; } if (likely(urb->actual_length != 0)) { skb_put(skb, urb->actual_length); ath9k_hif_usb_rx_stream(hif_dev, skb); } resubmit: __skb_set_length(skb, 0); usb_anchor_urb(urb, &hif_dev->rx_submitted); ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret) { usb_unanchor_urb(urb); goto free; } return; free: kfree_skb(skb); kfree(rx_buf); } static void ath9k_hif_usb_reg_in_cb(struct urb *urb) { struct rx_buf *rx_buf = urb->context; struct hif_device_usb *hif_dev = rx_buf->hif_dev; struct sk_buff *skb = rx_buf->skb; int ret; if (!skb) return; if (!hif_dev) goto free_skb; switch (urb->status) { case 0: break; case -ENOENT: case -ECONNRESET: case -ENODEV: case -ESHUTDOWN: goto free_skb; default: __skb_set_length(skb, 0); goto resubmit; } if (likely(urb->actual_length != 0)) { skb_put(skb, urb->actual_length); /* * Process the command first. * skb is either freed here or passed to be * managed to another callback function. */ ath9k_htc_rx_msg(hif_dev->htc_handle, skb, skb->len, USB_REG_IN_PIPE); skb = alloc_skb(MAX_REG_IN_BUF_SIZE, GFP_ATOMIC); if (!skb) { dev_err(&hif_dev->udev->dev, "ath9k_htc: REG_IN memory allocation failure\n"); goto free_rx_buf; } rx_buf->skb = skb; usb_fill_int_urb(urb, hif_dev->udev, usb_rcvintpipe(hif_dev->udev, USB_REG_IN_PIPE), skb->data, MAX_REG_IN_BUF_SIZE, ath9k_hif_usb_reg_in_cb, rx_buf, 1); } resubmit: usb_anchor_urb(urb, &hif_dev->reg_in_submitted); ret = usb_submit_urb(urb, GFP_ATOMIC); if (ret) { usb_unanchor_urb(urb); goto free_skb; } return; free_skb: kfree_skb(skb); free_rx_buf: kfree(rx_buf); urb->context = NULL; } static void ath9k_hif_usb_dealloc_tx_urbs(struct hif_device_usb *hif_dev) { struct tx_buf *tx_buf = NULL, *tx_buf_tmp = NULL; unsigned long flags; spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); list_for_each_entry_safe(tx_buf, tx_buf_tmp, &hif_dev->tx.tx_buf, list) { list_del(&tx_buf->list); usb_free_urb(tx_buf->urb); kfree(tx_buf->buf); kfree(tx_buf); } spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); hif_dev->tx.flags |= HIF_USB_TX_FLUSH; spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); list_for_each_entry_safe(tx_buf, tx_buf_tmp, &hif_dev->tx.tx_pending, list) { usb_get_urb(tx_buf->urb); spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); usb_kill_urb(tx_buf->urb); list_del(&tx_buf->list); usb_free_urb(tx_buf->urb); kfree(tx_buf->buf); kfree(tx_buf); spin_lock_irqsave(&hif_dev->tx.tx_lock, flags); } spin_unlock_irqrestore(&hif_dev->tx.tx_lock, flags); usb_kill_anchored_urbs(&hif_dev->mgmt_submitted); } static int ath9k_hif_usb_alloc_tx_urbs(struct hif_device_usb *hif_dev) { struct tx_buf *tx_buf; int i; INIT_LIST_HEAD(&hif_dev->tx.tx_buf); INIT_LIST_HEAD(&hif_dev->tx.tx_pending); spin_lock_init(&hif_dev->tx.tx_lock); __skb_queue_head_init(&hif_dev->tx.tx_skb_queue); init_usb_anchor(&hif_dev->mgmt_submitted); for (i = 0; i < MAX_TX_URB_NUM; i++) { tx_buf = kzalloc(sizeof(*tx_buf), GFP_KERNEL); if (!tx_buf) goto err; tx_buf->buf = kzalloc(MAX_TX_BUF_SIZE, GFP_KERNEL); if (!tx_buf->buf) goto err; tx_buf->urb = usb_alloc_urb(0, GFP_KERNEL); if (!tx_buf->urb) goto err; tx_buf->hif_dev = hif_dev; __skb_queue_head_init(&tx_buf->skb_queue); list_add_tail(&tx_buf->list, &hif_dev->tx.tx_buf); } hif_dev->tx.tx_buf_cnt = MAX_TX_URB_NUM; return 0; err: if (tx_buf) { kfree(tx_buf->buf); kfree(tx_buf); } ath9k_hif_usb_dealloc_tx_urbs(hif_dev); return -ENOMEM; } static void ath9k_hif_usb_dealloc_rx_urbs(struct hif_device_usb *hif_dev) { usb_kill_anchored_urbs(&hif_dev->rx_submitted); ath9k_hif_usb_free_rx_remain_skb(hif_dev); } static int ath9k_hif_usb_alloc_rx_urbs(struct hif_device_usb *hif_dev) { struct rx_buf *rx_buf = NULL; struct sk_buff *skb = NULL; struct urb *urb = NULL; int i, ret; init_usb_anchor(&hif_dev->rx_submitted); spin_lock_init(&hif_dev->rx_lock); for (i = 0; i < MAX_RX_URB_NUM; i++) { rx_buf = kzalloc(sizeof(*rx_buf), GFP_KERNEL); if (!rx_buf) { ret = -ENOMEM; goto err_rxb; } /* Allocate URB */ urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) { ret = -ENOMEM; goto err_urb; } /* Allocate buffer */ skb = alloc_skb(MAX_RX_BUF_SIZE, GFP_KERNEL); if (!skb) { ret = -ENOMEM; goto err_skb; } rx_buf->hif_dev = hif_dev; rx_buf->skb = skb; usb_fill_bulk_urb(urb, hif_dev->udev, usb_rcvbulkpipe(hif_dev->udev, USB_WLAN_RX_PIPE), skb->data, MAX_RX_BUF_SIZE, ath9k_hif_usb_rx_cb, rx_buf); /* Anchor URB */ usb_anchor_urb(urb, &hif_dev->rx_submitted); /* Submit URB */ ret = usb_submit_urb(urb, GFP_KERNEL); if (ret) { usb_unanchor_urb(urb); goto err_submit; } /* * Drop reference count. * This ensures that the URB is freed when killing them. */ usb_free_urb(urb); } return 0; err_submit: kfree_skb(skb); err_skb: usb_free_urb(urb); err_urb: kfree(rx_buf); err_rxb: ath9k_hif_usb_dealloc_rx_urbs(hif_dev); return ret; } static void ath9k_hif_usb_dealloc_reg_in_urbs(struct hif_device_usb *hif_dev) { usb_kill_anchored_urbs(&hif_dev->reg_in_submitted); } static int ath9k_hif_usb_alloc_reg_in_urbs(struct hif_device_usb *hif_dev) { struct rx_buf *rx_buf = NULL; struct sk_buff *skb = NULL; struct urb *urb = NULL; int i, ret; init_usb_anchor(&hif_dev->reg_in_submitted); for (i = 0; i < MAX_REG_IN_URB_NUM; i++) { rx_buf = kzalloc(sizeof(*rx_buf), GFP_KERNEL); if (!rx_buf) { ret = -ENOMEM; goto err_rxb; } /* Allocate URB */ urb = usb_alloc_urb(0, GFP_KERNEL); if (urb == NULL) { ret = -ENOMEM; goto err_urb; } /* Allocate buffer */ skb = alloc_skb(MAX_REG_IN_BUF_SIZE, GFP_KERNEL); if (!skb) { ret = -ENOMEM; goto err_skb; } rx_buf->hif_dev = hif_dev; rx_buf->skb = skb; usb_fill_int_urb(urb, hif_dev->udev, usb_rcvintpipe(hif_dev->udev, USB_REG_IN_PIPE), skb->data, MAX_REG_IN_BUF_SIZE, ath9k_hif_usb_reg_in_cb, rx_buf, 1); /* Anchor URB */ usb_anchor_urb(urb, &hif_dev->reg_in_submitted); /* Submit URB */ ret = usb_submit_urb(urb, GFP_KERNEL); if (ret) { usb_unanchor_urb(urb); goto err_submit; } /* * Drop reference count. * This ensures that the URB is freed when killing them. */ usb_free_urb(urb); } return 0; err_submit: kfree_skb(skb); err_skb: usb_free_urb(urb); err_urb: kfree(rx_buf); err_rxb: ath9k_hif_usb_dealloc_reg_in_urbs(hif_dev); return ret; } static int ath9k_hif_usb_alloc_urbs(struct hif_device_usb *hif_dev) { /* Register Write */ init_usb_anchor(&hif_dev->regout_submitted); /* TX */ if (ath9k_hif_usb_alloc_tx_urbs(hif_dev) < 0) goto err; /* RX */ if (ath9k_hif_usb_alloc_rx_urbs(hif_dev) < 0) goto err_rx; /* Register Read */ if (ath9k_hif_usb_alloc_reg_in_urbs(hif_dev) < 0) goto err_reg; return 0; err_reg: ath9k_hif_usb_dealloc_rx_urbs(hif_dev); err_rx: ath9k_hif_usb_dealloc_tx_urbs(hif_dev); err: return -ENOMEM; } void ath9k_hif_usb_dealloc_urbs(struct hif_device_usb *hif_dev) { usb_kill_anchored_urbs(&hif_dev->regout_submitted); ath9k_hif_usb_dealloc_reg_in_urbs(hif_dev); ath9k_hif_usb_dealloc_tx_urbs(hif_dev); ath9k_hif_usb_dealloc_rx_urbs(hif_dev); } static int ath9k_hif_usb_download_fw(struct hif_device_usb *hif_dev) { int transfer, err; const void *data = hif_dev->fw_data; size_t len = hif_dev->fw_size; u32 addr = AR9271_FIRMWARE; u8 *buf = kzalloc(4096, GFP_KERNEL); u32 firm_offset; if (!buf) return -ENOMEM; while (len) { transfer = min_t(size_t, len, 4096); memcpy(buf, data, transfer); err = usb_control_msg(hif_dev->udev, usb_sndctrlpipe(hif_dev->udev, 0), FIRMWARE_DOWNLOAD, 0x40 | USB_DIR_OUT, addr >> 8, 0, buf, transfer, USB_MSG_TIMEOUT); if (err < 0) { kfree(buf); return err; } len -= transfer; data += transfer; addr += transfer; } kfree(buf); if (IS_AR7010_DEVICE(hif_dev->usb_device_id->driver_info)) firm_offset = AR7010_FIRMWARE_TEXT; else firm_offset = AR9271_FIRMWARE_TEXT; /* * Issue FW download complete command to firmware. */ err = usb_control_msg(hif_dev->udev, usb_sndctrlpipe(hif_dev->udev, 0), FIRMWARE_DOWNLOAD_COMP, 0x40 | USB_DIR_OUT, firm_offset >> 8, 0, NULL, 0, USB_MSG_TIMEOUT); if (err) return -EIO; dev_info(&hif_dev->udev->dev, "ath9k_htc: Transferred FW: %s, size: %ld\n", hif_dev->fw_name, (unsigned long) hif_dev->fw_size); return 0; } static int ath9k_hif_usb_dev_init(struct hif_device_usb *hif_dev) { int ret; ret = ath9k_hif_usb_download_fw(hif_dev); if (ret) { dev_err(&hif_dev->udev->dev, "ath9k_htc: Firmware - %s download failed\n", hif_dev->fw_name); return ret; } /* Alloc URBs */ ret = ath9k_hif_usb_alloc_urbs(hif_dev); if (ret) { dev_err(&hif_dev->udev->dev, "ath9k_htc: Unable to allocate URBs\n"); return ret; } return 0; } static void ath9k_hif_usb_dev_deinit(struct hif_device_usb *hif_dev) { ath9k_hif_usb_dealloc_urbs(hif_dev); } /* * If initialization fails or the FW cannot be retrieved, * detach the device. */ static void ath9k_hif_usb_firmware_fail(struct hif_device_usb *hif_dev) { struct device *dev = &hif_dev->udev->dev; struct device *parent = dev->parent; complete_all(&hif_dev->fw_done); if (parent) device_lock(parent); device_release_driver(dev); if (parent) device_unlock(parent); } static void ath9k_hif_usb_firmware_cb(const struct firmware *fw, void *context); /* taken from iwlwifi */ static int ath9k_hif_request_firmware(struct hif_device_usb *hif_dev, bool first) { char index[8], *chip; int ret; if (first) { if (htc_use_dev_fw) { hif_dev->fw_minor_index = FIRMWARE_MINOR_IDX_MAX + 1; sprintf(index, "%s", "dev"); } else { hif_dev->fw_minor_index = FIRMWARE_MINOR_IDX_MAX; sprintf(index, "%d", hif_dev->fw_minor_index); } } else { hif_dev->fw_minor_index--; sprintf(index, "%d", hif_dev->fw_minor_index); } /* test for FW 1.3 */ if (MAJOR_VERSION_REQ == 1 && hif_dev->fw_minor_index == 3) { const char *filename; if (IS_AR7010_DEVICE(hif_dev->usb_device_id->driver_info)) filename = FIRMWARE_AR7010_1_1; else filename = FIRMWARE_AR9271; /* expected fw locations: * - htc_9271.fw (stable version 1.3, deprecated) */ snprintf(hif_dev->fw_name, sizeof(hif_dev->fw_name), "%s", filename); } else if (hif_dev->fw_minor_index < FIRMWARE_MINOR_IDX_MIN) { dev_err(&hif_dev->udev->dev, "no suitable firmware found!\n"); return -ENOENT; } else { if (IS_AR7010_DEVICE(hif_dev->usb_device_id->driver_info)) chip = "7010"; else chip = "9271"; /* expected fw locations: * - ath9k_htc/htc_9271-1.dev.0.fw (development version) * - ath9k_htc/htc_9271-1.4.0.fw (stable version) */ snprintf(hif_dev->fw_name, sizeof(hif_dev->fw_name), "%s/htc_%s-%d.%s.0.fw", HTC_FW_PATH, chip, MAJOR_VERSION_REQ, index); } ret = request_firmware_nowait(THIS_MODULE, true, hif_dev->fw_name, &hif_dev->udev->dev, GFP_KERNEL, hif_dev, ath9k_hif_usb_firmware_cb); if (ret) { dev_err(&hif_dev->udev->dev, "ath9k_htc: Async request for firmware %s failed\n", hif_dev->fw_name); return ret; } dev_info(&hif_dev->udev->dev, "ath9k_htc: Firmware %s requested\n", hif_dev->fw_name); return ret; } static void ath9k_hif_usb_firmware_cb(const struct firmware *fw, void *context) { struct hif_device_usb *hif_dev = context; int ret; if (!fw) { ret = ath9k_hif_request_firmware(hif_dev, false); if (!ret) return; dev_err(&hif_dev->udev->dev, "ath9k_htc: Failed to get firmware %s\n", hif_dev->fw_name); goto err_fw; } hif_dev->htc_handle = ath9k_htc_hw_alloc(hif_dev, &hif_usb, &hif_dev->udev->dev); if (hif_dev->htc_handle == NULL) goto err_dev_alloc; hif_dev->fw_data = fw->data; hif_dev->fw_size = fw->size; /* Proceed with initialization */ ret = ath9k_hif_usb_dev_init(hif_dev); if (ret) goto err_dev_init; ret = ath9k_htc_hw_init(hif_dev->htc_handle, &hif_dev->interface->dev, hif_dev->usb_device_id->idProduct, hif_dev->udev->product, hif_dev->usb_device_id->driver_info); if (ret) { ret = -EINVAL; goto err_htc_hw_init; } release_firmware(fw); hif_dev->flags |= HIF_USB_READY; complete_all(&hif_dev->fw_done); return; err_htc_hw_init: ath9k_hif_usb_dev_deinit(hif_dev); err_dev_init: ath9k_htc_hw_free(hif_dev->htc_handle); err_dev_alloc: release_firmware(fw); err_fw: ath9k_hif_usb_firmware_fail(hif_dev); } /* * An exact copy of the function from zd1211rw. */ static int send_eject_command(struct usb_interface *interface) { struct usb_device *udev = interface_to_usbdev(interface); struct usb_host_interface *iface_desc = interface->cur_altsetting; struct usb_endpoint_descriptor *endpoint; unsigned char *cmd; u8 bulk_out_ep; int r; if (iface_desc->desc.bNumEndpoints < 2) return -ENODEV; /* Find bulk out endpoint */ for (r = 1; r >= 0; r--) { endpoint = &iface_desc->endpoint[r].desc; if (usb_endpoint_dir_out(endpoint) && usb_endpoint_xfer_bulk(endpoint)) { bulk_out_ep = endpoint->bEndpointAddress; break; } } if (r == -1) { dev_err(&udev->dev, "ath9k_htc: Could not find bulk out endpoint\n"); return -ENODEV; } cmd = kzalloc(31, GFP_KERNEL); if (cmd == NULL) return -ENODEV; /* USB bulk command block */ cmd[0] = 0x55; /* bulk command signature */ cmd[1] = 0x53; /* bulk command signature */ cmd[2] = 0x42; /* bulk command signature */ cmd[3] = 0x43; /* bulk command signature */ cmd[14] = 6; /* command length */ cmd[15] = 0x1b; /* SCSI command: START STOP UNIT */ cmd[19] = 0x2; /* eject disc */ dev_info(&udev->dev, "Ejecting storage device...\n"); r = usb_bulk_msg(udev, usb_sndbulkpipe(udev, bulk_out_ep), cmd, 31, NULL, 2 * USB_MSG_TIMEOUT); kfree(cmd); if (r) return r; /* At this point, the device disconnects and reconnects with the real * ID numbers. */ usb_set_intfdata(interface, NULL); return 0; } static int ath9k_hif_usb_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_endpoint_descriptor *bulk_in, *bulk_out, *int_in, *int_out; struct usb_device *udev = interface_to_usbdev(interface); struct usb_host_interface *alt; struct hif_device_usb *hif_dev; int ret = 0; /* Verify the expected endpoints are present */ alt = interface->cur_altsetting; if (usb_find_common_endpoints(alt, &bulk_in, &bulk_out, &int_in, &int_out) < 0 || usb_endpoint_num(bulk_in) != USB_WLAN_RX_PIPE || usb_endpoint_num(bulk_out) != USB_WLAN_TX_PIPE || usb_endpoint_num(int_in) != USB_REG_IN_PIPE || usb_endpoint_num(int_out) != USB_REG_OUT_PIPE) { dev_err(&udev->dev, "ath9k_htc: Device endpoint numbers are not the expected ones\n"); return -ENODEV; } if (id->driver_info == STORAGE_DEVICE) return send_eject_command(interface); hif_dev = kzalloc(sizeof(struct hif_device_usb), GFP_KERNEL); if (!hif_dev) { ret = -ENOMEM; goto err_alloc; } usb_get_dev(udev); hif_dev->udev = udev; hif_dev->interface = interface; hif_dev->usb_device_id = id; #ifdef CONFIG_PM udev->reset_resume = 1; #endif usb_set_intfdata(interface, hif_dev); init_completion(&hif_dev->fw_done); ret = ath9k_hif_request_firmware(hif_dev, true); if (ret) goto err_fw_req; return ret; err_fw_req: usb_set_intfdata(interface, NULL); kfree(hif_dev); usb_put_dev(udev); err_alloc: return ret; } static void ath9k_hif_usb_reboot(struct usb_device *udev) { u32 reboot_cmd = 0xffffffff; void *buf; int ret; buf = kmemdup(&reboot_cmd, 4, GFP_KERNEL); if (!buf) return; ret = usb_interrupt_msg(udev, usb_sndintpipe(udev, USB_REG_OUT_PIPE), buf, 4, NULL, USB_MSG_TIMEOUT); if (ret) dev_err(&udev->dev, "ath9k_htc: USB reboot failed\n"); kfree(buf); } static void ath9k_hif_usb_disconnect(struct usb_interface *interface) { struct usb_device *udev = interface_to_usbdev(interface); struct hif_device_usb *hif_dev = usb_get_intfdata(interface); bool unplugged = udev->state == USB_STATE_NOTATTACHED; if (!hif_dev) return; wait_for_completion(&hif_dev->fw_done); if (hif_dev->flags & HIF_USB_READY) { ath9k_htc_hw_deinit(hif_dev->htc_handle, unplugged); ath9k_htc_hw_free(hif_dev->htc_handle); } usb_set_intfdata(interface, NULL); /* If firmware was loaded we should drop it * go back to first stage bootloader. */ if (!unplugged && (hif_dev->flags & HIF_USB_READY)) ath9k_hif_usb_reboot(udev); kfree(hif_dev); dev_info(&udev->dev, "ath9k_htc: USB layer deinitialized\n"); usb_put_dev(udev); } #ifdef CONFIG_PM static int ath9k_hif_usb_suspend(struct usb_interface *interface, pm_message_t message) { struct hif_device_usb *hif_dev = usb_get_intfdata(interface); /* * The device has to be set to FULLSLEEP mode in case no * interface is up. */ if (!(hif_dev->flags & HIF_USB_START)) ath9k_htc_suspend(hif_dev->htc_handle); wait_for_completion(&hif_dev->fw_done); if (hif_dev->flags & HIF_USB_READY) ath9k_hif_usb_dealloc_urbs(hif_dev); return 0; } static int ath9k_hif_usb_resume(struct usb_interface *interface) { struct hif_device_usb *hif_dev = usb_get_intfdata(interface); struct htc_target *htc_handle = hif_dev->htc_handle; const struct firmware *fw; int ret; ret = ath9k_hif_usb_alloc_urbs(hif_dev); if (ret) return ret; if (!(hif_dev->flags & HIF_USB_READY)) { ret = -EIO; goto fail_resume; } /* request cached firmware during suspend/resume cycle */ ret = request_firmware(&fw, hif_dev->fw_name, &hif_dev->udev->dev); if (ret) goto fail_resume; hif_dev->fw_data = fw->data; hif_dev->fw_size = fw->size; ret = ath9k_hif_usb_download_fw(hif_dev); release_firmware(fw); if (ret) goto fail_resume; mdelay(100); ret = ath9k_htc_resume(htc_handle); if (ret) goto fail_resume; return 0; fail_resume: ath9k_hif_usb_dealloc_urbs(hif_dev); return ret; } #endif static struct usb_driver ath9k_hif_usb_driver = { .name = KBUILD_MODNAME, .probe = ath9k_hif_usb_probe, .disconnect = ath9k_hif_usb_disconnect, #ifdef CONFIG_PM .suspend = ath9k_hif_usb_suspend, .resume = ath9k_hif_usb_resume, .reset_resume = ath9k_hif_usb_resume, #endif .id_table = ath9k_hif_usb_ids, .soft_unbind = 1, .disable_hub_initiated_lpm = 1, }; int ath9k_hif_usb_init(void) { return usb_register(&ath9k_hif_usb_driver); } void ath9k_hif_usb_exit(void) { usb_deregister(&ath9k_hif_usb_driver); } |
27 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 | // SPDX-License-Identifier: GPL-2.0+ /* * Helpers for controlling modem lines via GPIO * * Copyright (C) 2014 Paratronic S.A. */ #include <linux/err.h> #include <linux/device.h> #include <linux/irq.h> #include <linux/gpio/consumer.h> #include <linux/termios.h> #include <linux/serial_core.h> #include <linux/module.h> #include <linux/property.h> #include "serial_mctrl_gpio.h" struct mctrl_gpios { struct uart_port *port; struct gpio_desc *gpio[UART_GPIO_MAX]; int irq[UART_GPIO_MAX]; unsigned int mctrl_prev; bool mctrl_on; }; static const struct { const char *name; unsigned int mctrl; enum gpiod_flags flags; } mctrl_gpios_desc[UART_GPIO_MAX] = { { "cts", TIOCM_CTS, GPIOD_IN, }, { "dsr", TIOCM_DSR, GPIOD_IN, }, { "dcd", TIOCM_CD, GPIOD_IN, }, { "rng", TIOCM_RNG, GPIOD_IN, }, { "rts", TIOCM_RTS, GPIOD_OUT_LOW, }, { "dtr", TIOCM_DTR, GPIOD_OUT_LOW, }, }; static bool mctrl_gpio_flags_is_dir_out(unsigned int idx) { return mctrl_gpios_desc[idx].flags & GPIOD_FLAGS_BIT_DIR_OUT; } /** * mctrl_gpio_set - set gpios according to mctrl state * @gpios: gpios to set * @mctrl: state to set * * Set the gpios according to the mctrl state. */ void mctrl_gpio_set(struct mctrl_gpios *gpios, unsigned int mctrl) { enum mctrl_gpio_idx i; struct gpio_desc *desc_array[UART_GPIO_MAX]; DECLARE_BITMAP(values, UART_GPIO_MAX); unsigned int count = 0; if (gpios == NULL) return; for (i = 0; i < UART_GPIO_MAX; i++) if (gpios->gpio[i] && mctrl_gpio_flags_is_dir_out(i)) { desc_array[count] = gpios->gpio[i]; __assign_bit(count, values, mctrl & mctrl_gpios_desc[i].mctrl); count++; } gpiod_set_array_value(count, desc_array, NULL, values); } EXPORT_SYMBOL_GPL(mctrl_gpio_set); /** * mctrl_gpio_to_gpiod - obtain gpio_desc of modem line index * @gpios: gpios to look into * @gidx: index of the modem line * Returns: the gpio_desc structure associated to the modem line index */ struct gpio_desc *mctrl_gpio_to_gpiod(struct mctrl_gpios *gpios, enum mctrl_gpio_idx gidx) { if (gpios == NULL) return NULL; return gpios->gpio[gidx]; } EXPORT_SYMBOL_GPL(mctrl_gpio_to_gpiod); /** * mctrl_gpio_get - update mctrl with the gpios values. * @gpios: gpios to get the info from * @mctrl: mctrl to set * Returns: modified mctrl (the same value as in @mctrl) * * Update mctrl with the gpios values. */ unsigned int mctrl_gpio_get(struct mctrl_gpios *gpios, unsigned int *mctrl) { enum mctrl_gpio_idx i; if (gpios == NULL) return *mctrl; for (i = 0; i < UART_GPIO_MAX; i++) { if (gpios->gpio[i] && !mctrl_gpio_flags_is_dir_out(i)) { if (gpiod_get_value(gpios->gpio[i])) *mctrl |= mctrl_gpios_desc[i].mctrl; else *mctrl &= ~mctrl_gpios_desc[i].mctrl; } } return *mctrl; } EXPORT_SYMBOL_GPL(mctrl_gpio_get); unsigned int mctrl_gpio_get_outputs(struct mctrl_gpios *gpios, unsigned int *mctrl) { enum mctrl_gpio_idx i; if (gpios == NULL) return *mctrl; for (i = 0; i < UART_GPIO_MAX; i++) { if (gpios->gpio[i] && mctrl_gpio_flags_is_dir_out(i)) { if (gpiod_get_value(gpios->gpio[i])) *mctrl |= mctrl_gpios_desc[i].mctrl; else *mctrl &= ~mctrl_gpios_desc[i].mctrl; } } return *mctrl; } EXPORT_SYMBOL_GPL(mctrl_gpio_get_outputs); struct mctrl_gpios *mctrl_gpio_init_noauto(struct device *dev, unsigned int idx) { struct mctrl_gpios *gpios; enum mctrl_gpio_idx i; gpios = devm_kzalloc(dev, sizeof(*gpios), GFP_KERNEL); if (!gpios) return ERR_PTR(-ENOMEM); for (i = 0; i < UART_GPIO_MAX; i++) { char *gpio_str; bool present; /* Check if GPIO property exists and continue if not */ gpio_str = kasprintf(GFP_KERNEL, "%s-gpios", mctrl_gpios_desc[i].name); if (!gpio_str) continue; present = device_property_present(dev, gpio_str); kfree(gpio_str); if (!present) continue; gpios->gpio[i] = devm_gpiod_get_index_optional(dev, mctrl_gpios_desc[i].name, idx, mctrl_gpios_desc[i].flags); if (IS_ERR(gpios->gpio[i])) return ERR_CAST(gpios->gpio[i]); } return gpios; } EXPORT_SYMBOL_GPL(mctrl_gpio_init_noauto); #define MCTRL_ANY_DELTA (TIOCM_RI | TIOCM_DSR | TIOCM_CD | TIOCM_CTS) static irqreturn_t mctrl_gpio_irq_handle(int irq, void *context) { struct mctrl_gpios *gpios = context; struct uart_port *port = gpios->port; u32 mctrl = gpios->mctrl_prev; u32 mctrl_diff; unsigned long flags; mctrl_gpio_get(gpios, &mctrl); uart_port_lock_irqsave(port, &flags); mctrl_diff = mctrl ^ gpios->mctrl_prev; gpios->mctrl_prev = mctrl; if (mctrl_diff & MCTRL_ANY_DELTA && port->state != NULL) { if ((mctrl_diff & mctrl) & TIOCM_RI) port->icount.rng++; if ((mctrl_diff & mctrl) & TIOCM_DSR) port->icount.dsr++; if (mctrl_diff & TIOCM_CD) uart_handle_dcd_change(port, mctrl & TIOCM_CD); if (mctrl_diff & TIOCM_CTS) uart_handle_cts_change(port, mctrl & TIOCM_CTS); wake_up_interruptible(&port->state->port.delta_msr_wait); } uart_port_unlock_irqrestore(port, flags); return IRQ_HANDLED; } /** * mctrl_gpio_init - initialize uart gpios * @port: port to initialize gpios for * @idx: index of the gpio in the @port's device * * This will get the {cts,rts,...}-gpios from device tree if they are present * and request them, set direction etc, and return an allocated structure. * `devm_*` functions are used, so there's no need to explicitly free. * As this sets up the irq handling, make sure to not handle changes to the * gpio input lines in your driver, too. */ struct mctrl_gpios *mctrl_gpio_init(struct uart_port *port, unsigned int idx) { struct mctrl_gpios *gpios; enum mctrl_gpio_idx i; gpios = mctrl_gpio_init_noauto(port->dev, idx); if (IS_ERR(gpios)) return gpios; gpios->port = port; for (i = 0; i < UART_GPIO_MAX; ++i) { int ret; if (!gpios->gpio[i] || mctrl_gpio_flags_is_dir_out(i)) continue; ret = gpiod_to_irq(gpios->gpio[i]); if (ret < 0) { dev_err(port->dev, "failed to find corresponding irq for %s (idx=%d, err=%d)\n", mctrl_gpios_desc[i].name, idx, ret); return ERR_PTR(ret); } gpios->irq[i] = ret; /* irqs should only be enabled in .enable_ms */ irq_set_status_flags(gpios->irq[i], IRQ_NOAUTOEN); ret = devm_request_irq(port->dev, gpios->irq[i], mctrl_gpio_irq_handle, IRQ_TYPE_EDGE_BOTH, dev_name(port->dev), gpios); if (ret) { /* alternatively implement polling */ dev_err(port->dev, "failed to request irq for %s (idx=%d, err=%d)\n", mctrl_gpios_desc[i].name, idx, ret); return ERR_PTR(ret); } } return gpios; } EXPORT_SYMBOL_GPL(mctrl_gpio_init); /** * mctrl_gpio_enable_ms - enable irqs and handling of changes to the ms lines * @gpios: gpios to enable */ void mctrl_gpio_enable_ms(struct mctrl_gpios *gpios) { enum mctrl_gpio_idx i; if (gpios == NULL) return; /* .enable_ms may be called multiple times */ if (gpios->mctrl_on) return; gpios->mctrl_on = true; /* get initial status of modem lines GPIOs */ mctrl_gpio_get(gpios, &gpios->mctrl_prev); for (i = 0; i < UART_GPIO_MAX; ++i) { if (!gpios->irq[i]) continue; enable_irq(gpios->irq[i]); } } EXPORT_SYMBOL_GPL(mctrl_gpio_enable_ms); static void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios, bool sync) { enum mctrl_gpio_idx i; if (gpios == NULL) return; if (!gpios->mctrl_on) return; gpios->mctrl_on = false; for (i = 0; i < UART_GPIO_MAX; ++i) { if (!gpios->irq[i]) continue; if (sync) disable_irq(gpios->irq[i]); else disable_irq_nosync(gpios->irq[i]); } } /** * mctrl_gpio_disable_ms_sync - disable irqs and handling of changes to the ms * lines, and wait for any pending IRQ to be processed * @gpios: gpios to disable */ void mctrl_gpio_disable_ms_sync(struct mctrl_gpios *gpios) { mctrl_gpio_disable_ms(gpios, true); } EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms_sync); /** * mctrl_gpio_disable_ms_no_sync - disable irqs and handling of changes to the * ms lines, and return immediately * @gpios: gpios to disable */ void mctrl_gpio_disable_ms_no_sync(struct mctrl_gpios *gpios) { mctrl_gpio_disable_ms(gpios, false); } EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms_no_sync); void mctrl_gpio_enable_irq_wake(struct mctrl_gpios *gpios) { enum mctrl_gpio_idx i; if (!gpios) return; if (!gpios->mctrl_on) return; for (i = 0; i < UART_GPIO_MAX; ++i) { if (!gpios->irq[i]) continue; enable_irq_wake(gpios->irq[i]); } } EXPORT_SYMBOL_GPL(mctrl_gpio_enable_irq_wake); void mctrl_gpio_disable_irq_wake(struct mctrl_gpios *gpios) { enum mctrl_gpio_idx i; if (!gpios) return; if (!gpios->mctrl_on) return; for (i = 0; i < UART_GPIO_MAX; ++i) { if (!gpios->irq[i]) continue; disable_irq_wake(gpios->irq[i]); } } EXPORT_SYMBOL_GPL(mctrl_gpio_disable_irq_wake); MODULE_DESCRIPTION("Helpers for controlling modem lines via GPIO"); MODULE_LICENSE("GPL"); |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 | // SPDX-License-Identifier: GPL-2.0+ /* * USB Keyspan PDA / Xircom / Entrega Converter driver * * Copyright (C) 1999 - 2001 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 1999, 2000 Brian Warner <warner@lothar.com> * Copyright (C) 2000 Al Borchers <borchers@steinerpoint.com> * Copyright (C) 2020 Johan Hovold <johan@kernel.org> * * See Documentation/usb/usb-serial.rst for more information on using this * driver */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/usb/ezusb.h> #define DRIVER_AUTHOR "Brian Warner <warner@lothar.com>, Johan Hovold <johan@kernel.org>" #define DRIVER_DESC "USB Keyspan PDA Converter driver" #define KEYSPAN_TX_THRESHOLD 128 struct keyspan_pda_private { int tx_room; struct work_struct unthrottle_work; struct usb_serial *serial; struct usb_serial_port *port; }; static int keyspan_pda_write_start(struct usb_serial_port *port); #define KEYSPAN_VENDOR_ID 0x06cd #define KEYSPAN_PDA_FAKE_ID 0x0103 #define KEYSPAN_PDA_ID 0x0104 /* no clue */ /* For Xircom PGSDB9 and older Entrega version of the same device */ #define XIRCOM_VENDOR_ID 0x085a #define XIRCOM_FAKE_ID 0x8027 #define XIRCOM_FAKE_ID_2 0x8025 /* "PGMFHUB" serial */ #define ENTREGA_VENDOR_ID 0x1645 #define ENTREGA_FAKE_ID 0x8093 static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, KEYSPAN_PDA_FAKE_ID) }, { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID) }, { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID_2) }, { USB_DEVICE(ENTREGA_VENDOR_ID, ENTREGA_FAKE_ID) }, { USB_DEVICE(KEYSPAN_VENDOR_ID, KEYSPAN_PDA_ID) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, id_table_combined); static const struct usb_device_id id_table_std[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, KEYSPAN_PDA_ID) }, { } /* Terminating entry */ }; static const struct usb_device_id id_table_fake[] = { { USB_DEVICE(KEYSPAN_VENDOR_ID, KEYSPAN_PDA_FAKE_ID) }, { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID) }, { USB_DEVICE(XIRCOM_VENDOR_ID, XIRCOM_FAKE_ID_2) }, { USB_DEVICE(ENTREGA_VENDOR_ID, ENTREGA_FAKE_ID) }, { } /* Terminating entry */ }; static int keyspan_pda_get_write_room(struct keyspan_pda_private *priv) { struct usb_serial_port *port = priv->port; struct usb_serial *serial = port->serial; u8 room; int rc; rc = usb_control_msg_recv(serial->dev, 0, 6, /* write_room */ USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_IN, 0, /* value: 0 means "remaining room" */ 0, /* index */ &room, 1, 2000, GFP_KERNEL); if (rc) { dev_dbg(&port->dev, "roomquery failed: %d\n", rc); return rc; } dev_dbg(&port->dev, "roomquery says %d\n", room); return room; } static void keyspan_pda_request_unthrottle(struct work_struct *work) { struct keyspan_pda_private *priv = container_of(work, struct keyspan_pda_private, unthrottle_work); struct usb_serial_port *port = priv->port; struct usb_serial *serial = port->serial; unsigned long flags; int result; dev_dbg(&port->dev, "%s\n", __func__); /* * Ask the device to tell us when the tx buffer becomes * sufficiently empty. */ result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), 7, /* request_unthrottle */ USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT, KEYSPAN_TX_THRESHOLD, 0, /* index */ NULL, 0, 2000); if (result < 0) dev_dbg(&serial->dev->dev, "%s - error %d from usb_control_msg\n", __func__, result); /* * Need to check available space after requesting notification in case * buffer is already empty so that no notification is sent. */ result = keyspan_pda_get_write_room(priv); if (result > KEYSPAN_TX_THRESHOLD) { spin_lock_irqsave(&port->lock, flags); priv->tx_room = max(priv->tx_room, result); spin_unlock_irqrestore(&port->lock, flags); usb_serial_port_softint(port); } } static void keyspan_pda_rx_interrupt(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; unsigned int len = urb->actual_length; int retval; int status = urb->status; struct keyspan_pda_private *priv; unsigned long flags; priv = usb_get_serial_port_data(port); switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&urb->dev->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(&urb->dev->dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } if (len < 1) { dev_warn(&port->dev, "short message received\n"); goto exit; } /* see if the message is data or a status interrupt */ switch (data[0]) { case 0: /* rest of message is rx data */ if (len < 2) break; tty_insert_flip_string(&port->port, data + 1, len - 1); tty_flip_buffer_push(&port->port); break; case 1: /* status interrupt */ if (len < 2) { dev_warn(&port->dev, "short interrupt message received\n"); break; } dev_dbg(&port->dev, "rx int, d1=%d\n", data[1]); switch (data[1]) { case 1: /* modemline change */ break; case 2: /* tx unthrottle interrupt */ spin_lock_irqsave(&port->lock, flags); priv->tx_room = max(priv->tx_room, KEYSPAN_TX_THRESHOLD); spin_unlock_irqrestore(&port->lock, flags); keyspan_pda_write_start(port); usb_serial_port_softint(port); break; default: break; } break; default: break; } exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&port->dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } static void keyspan_pda_rx_throttle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; /* * Stop receiving characters. We just turn off the URB request, and * let chars pile up in the device. If we're doing hardware * flowcontrol, the device will signal the other end when its buffer * fills up. If we're doing XON/XOFF, this would be a good time to * send an XOFF, although it might make sense to foist that off upon * the device too. */ usb_kill_urb(port->interrupt_in_urb); } static void keyspan_pda_rx_unthrottle(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; /* just restart the receive interrupt URB */ if (usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL)) dev_dbg(&port->dev, "usb_submit_urb(read urb) failed\n"); } static speed_t keyspan_pda_setbaud(struct usb_serial *serial, speed_t baud) { int rc; int bindex; switch (baud) { case 110: bindex = 0; break; case 300: bindex = 1; break; case 1200: bindex = 2; break; case 2400: bindex = 3; break; case 4800: bindex = 4; break; case 9600: bindex = 5; break; case 19200: bindex = 6; break; case 38400: bindex = 7; break; case 57600: bindex = 8; break; case 115200: bindex = 9; break; default: bindex = 5; /* Default to 9600 */ baud = 9600; } rc = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), 0, /* set baud */ USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT, /* type */ bindex, /* value */ 0, /* index */ NULL, /* &data */ 0, /* size */ 2000); /* timeout */ if (rc < 0) return 0; return baud; } static int keyspan_pda_break_ctl(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; int value; int result; if (break_state == -1) value = 1; /* start break */ else value = 0; /* clear break */ result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), 4, /* set break */ USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT, value, 0, NULL, 0, 2000); if (result < 0) { dev_dbg(&port->dev, "%s - error %d from usb_control_msg\n", __func__, result); return result; } return 0; } static void keyspan_pda_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct usb_serial *serial = port->serial; speed_t speed; /* * cflag specifies lots of stuff: number of stop bits, parity, number * of data bits, baud. What can the device actually handle?: * CSTOPB (1 stop bit or 2) * PARENB (parity) * CSIZE (5bit .. 8bit) * There is minimal hw support for parity (a PSW bit seems to hold the * parity of whatever is in the accumulator). The UART either deals * with 10 bits (start, 8 data, stop) or 11 bits (start, 8 data, * 1 special, stop). So, with firmware changes, we could do: * 8N1: 10 bit * 8N2: 11 bit, extra bit always (mark?) * 8[EOMS]1: 11 bit, extra bit is parity * 7[EOMS]1: 10 bit, b0/b7 is parity * 7[EOMS]2: 11 bit, b0/b7 is parity, extra bit always (mark?) * * HW flow control is dictated by the tty->termios.c_cflags & CRTSCTS * bit. * * For now, just do baud. */ speed = tty_get_baud_rate(tty); speed = keyspan_pda_setbaud(serial, speed); if (speed == 0) { dev_dbg(&port->dev, "can't handle requested baud rate\n"); /* It hasn't changed so.. */ speed = tty_termios_baud_rate(old_termios); } /* * Only speed can change so copy the old h/w parameters then encode * the new speed. */ tty_termios_copy_hw(&tty->termios, old_termios); tty_encode_baud_rate(tty, speed, speed); } /* * Modem control pins: DTR and RTS are outputs and can be controlled. * DCD, RI, DSR, CTS are inputs and can be read. All outputs can also be * read. The byte passed is: DTR(b7) DCD RI DSR CTS RTS(b2) unused unused. */ static int keyspan_pda_get_modem_info(struct usb_serial *serial, unsigned char *value) { int rc; u8 data; rc = usb_control_msg_recv(serial->dev, 0, 3, /* get pins */ USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_IN, 0, 0, &data, 1, 2000, GFP_KERNEL); if (rc == 0) *value = data; return rc; } static int keyspan_pda_set_modem_info(struct usb_serial *serial, unsigned char value) { int rc; rc = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), 3, /* set pins */ USB_TYPE_VENDOR|USB_RECIP_INTERFACE|USB_DIR_OUT, value, 0, NULL, 0, 2000); return rc; } static int keyspan_pda_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; int rc; unsigned char status; int value; rc = keyspan_pda_get_modem_info(serial, &status); if (rc < 0) return rc; value = ((status & BIT(7)) ? TIOCM_DTR : 0) | ((status & BIT(6)) ? TIOCM_CAR : 0) | ((status & BIT(5)) ? TIOCM_RNG : 0) | ((status & BIT(4)) ? TIOCM_DSR : 0) | ((status & BIT(3)) ? TIOCM_CTS : 0) | ((status & BIT(2)) ? TIOCM_RTS : 0); return value; } static int keyspan_pda_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; struct usb_serial *serial = port->serial; int rc; unsigned char status; rc = keyspan_pda_get_modem_info(serial, &status); if (rc < 0) return rc; if (set & TIOCM_RTS) status |= BIT(2); if (set & TIOCM_DTR) status |= BIT(7); if (clear & TIOCM_RTS) status &= ~BIT(2); if (clear & TIOCM_DTR) status &= ~BIT(7); rc = keyspan_pda_set_modem_info(serial, status); return rc; } static int keyspan_pda_write_start(struct usb_serial_port *port) { struct keyspan_pda_private *priv = usb_get_serial_port_data(port); unsigned long flags; struct urb *urb; int count; int room; int rc; /* * Guess how much room is left in the device's ring buffer. If our * write will result in no room left, ask the device to give us an * interrupt when the room available rises above a threshold but also * query how much room is currently available (in case our guess was * too conservative and the buffer is already empty when the * unthrottle work is scheduled). */ /* * We might block because of: * the TX urb is in-flight (wait until it completes) * the device is full (wait until it says there is room) */ spin_lock_irqsave(&port->lock, flags); room = priv->tx_room; count = kfifo_len(&port->write_fifo); if (!test_bit(0, &port->write_urbs_free) || count == 0 || room == 0) { spin_unlock_irqrestore(&port->lock, flags); return 0; } __clear_bit(0, &port->write_urbs_free); if (count > room) count = room; if (count > port->bulk_out_size) count = port->bulk_out_size; urb = port->write_urb; count = kfifo_out(&port->write_fifo, urb->transfer_buffer, count); urb->transfer_buffer_length = count; port->tx_bytes += count; priv->tx_room -= count; spin_unlock_irqrestore(&port->lock, flags); dev_dbg(&port->dev, "%s - count = %d, txroom = %d\n", __func__, count, room); rc = usb_submit_urb(urb, GFP_ATOMIC); if (rc) { dev_dbg(&port->dev, "usb_submit_urb(write bulk) failed\n"); spin_lock_irqsave(&port->lock, flags); port->tx_bytes -= count; priv->tx_room = max(priv->tx_room, room + count); __set_bit(0, &port->write_urbs_free); spin_unlock_irqrestore(&port->lock, flags); return rc; } if (count == room) schedule_work(&priv->unthrottle_work); return count; } static void keyspan_pda_write_bulk_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned long flags; spin_lock_irqsave(&port->lock, flags); port->tx_bytes -= urb->transfer_buffer_length; __set_bit(0, &port->write_urbs_free); spin_unlock_irqrestore(&port->lock, flags); keyspan_pda_write_start(port); usb_serial_port_softint(port); } static int keyspan_pda_write(struct tty_struct *tty, struct usb_serial_port *port, const unsigned char *buf, int count) { int rc; dev_dbg(&port->dev, "%s - count = %d\n", __func__, count); if (!count) return 0; count = kfifo_in_locked(&port->write_fifo, buf, count, &port->lock); rc = keyspan_pda_write_start(port); if (rc) return rc; return count; } static void keyspan_pda_dtr_rts(struct usb_serial_port *port, int on) { struct usb_serial *serial = port->serial; if (on) keyspan_pda_set_modem_info(serial, BIT(7) | BIT(2)); else keyspan_pda_set_modem_info(serial, 0); } static int keyspan_pda_open(struct tty_struct *tty, struct usb_serial_port *port) { struct keyspan_pda_private *priv = usb_get_serial_port_data(port); int rc; /* find out how much room is in the Tx ring */ rc = keyspan_pda_get_write_room(priv); if (rc < 0) return rc; spin_lock_irq(&port->lock); priv->tx_room = rc; spin_unlock_irq(&port->lock); rc = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (rc) { dev_dbg(&port->dev, "%s - usb_submit_urb(read int) failed\n", __func__); return rc; } return 0; } static void keyspan_pda_close(struct usb_serial_port *port) { struct keyspan_pda_private *priv = usb_get_serial_port_data(port); /* * Stop the interrupt URB first as its completion handler may submit * the write URB. */ usb_kill_urb(port->interrupt_in_urb); usb_kill_urb(port->write_urb); cancel_work_sync(&priv->unthrottle_work); spin_lock_irq(&port->lock); kfifo_reset(&port->write_fifo); spin_unlock_irq(&port->lock); } /* download the firmware to a "fake" device (pre-renumeration) */ static int keyspan_pda_fake_startup(struct usb_serial *serial) { unsigned int vid = le16_to_cpu(serial->dev->descriptor.idVendor); const char *fw_name; /* download the firmware here ... */ ezusb_fx1_set_reset(serial->dev, 1); switch (vid) { case KEYSPAN_VENDOR_ID: fw_name = "keyspan_pda/keyspan_pda.fw"; break; case XIRCOM_VENDOR_ID: case ENTREGA_VENDOR_ID: fw_name = "keyspan_pda/xircom_pgs.fw"; break; default: dev_err(&serial->dev->dev, "%s: unknown vendor, aborting.\n", __func__); return -ENODEV; } if (ezusb_fx1_ihex_firmware_download(serial->dev, fw_name) < 0) { dev_err(&serial->dev->dev, "failed to load firmware \"%s\"\n", fw_name); return -ENOENT; } /* * After downloading firmware renumeration will occur in a moment and * the new device will bind to the real driver. */ /* We want this device to fail to have a driver assigned to it. */ return 1; } MODULE_FIRMWARE("keyspan_pda/keyspan_pda.fw"); MODULE_FIRMWARE("keyspan_pda/xircom_pgs.fw"); static int keyspan_pda_port_probe(struct usb_serial_port *port) { struct keyspan_pda_private *priv; priv = kmalloc(sizeof(struct keyspan_pda_private), GFP_KERNEL); if (!priv) return -ENOMEM; INIT_WORK(&priv->unthrottle_work, keyspan_pda_request_unthrottle); priv->port = port; usb_set_serial_port_data(port, priv); return 0; } static void keyspan_pda_port_remove(struct usb_serial_port *port) { struct keyspan_pda_private *priv; priv = usb_get_serial_port_data(port); kfree(priv); } static struct usb_serial_driver keyspan_pda_fake_device = { .driver = { .name = "keyspan_pda_pre", }, .description = "Keyspan PDA - (prerenumeration)", .id_table = id_table_fake, .num_ports = 1, .attach = keyspan_pda_fake_startup, }; static struct usb_serial_driver keyspan_pda_device = { .driver = { .name = "keyspan_pda", }, .description = "Keyspan PDA", .id_table = id_table_std, .num_ports = 1, .num_bulk_out = 1, .num_interrupt_in = 1, .dtr_rts = keyspan_pda_dtr_rts, .open = keyspan_pda_open, .close = keyspan_pda_close, .write = keyspan_pda_write, .write_bulk_callback = keyspan_pda_write_bulk_callback, .read_int_callback = keyspan_pda_rx_interrupt, .throttle = keyspan_pda_rx_throttle, .unthrottle = keyspan_pda_rx_unthrottle, .set_termios = keyspan_pda_set_termios, .break_ctl = keyspan_pda_break_ctl, .tiocmget = keyspan_pda_tiocmget, .tiocmset = keyspan_pda_tiocmset, .port_probe = keyspan_pda_port_probe, .port_remove = keyspan_pda_port_remove, }; static struct usb_serial_driver * const serial_drivers[] = { &keyspan_pda_device, &keyspan_pda_fake_device, NULL }; module_usb_serial_driver(serial_drivers, id_table_combined); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); |
115 2374 214 8 183 143 617 708 51 948 631 630 158 7 7 1012 4183 53 53 53 4544 2054 8024 5553 4333 102 1944 760 4 6 448 497 65 95 310 188 270 1375 1142 1090 135 2 33 873 3857 11985 952 12620 438 439 8493 119 9066 385 108 24 116 7872 375 102 1 687 121 2165 2160 2073 2075 2468 966 2443 966 25 1463 43 13 586 1131 562 1131 1301 73 3240 1 3240 1 5863 5246 1419 2824 5752 42 778 5127 1308 42 667 147 3395 6099 7576 4783 533 157 1052 110 79 9 39 1425 1491 672 126 160 2952 827 28 2 42 1 28 1 28 6662 437 377 437 122 24 47 2 9 169 9 234 411 300 128 58 137 40 32 143 1786 3260 1657 43 43 38 40 1610 360 358 25 185 312 311 138 138 6012 94 147 3282 1930 32 8456 3771 3241 532 237 4568 1284 20 141 1 138 14 1304 21 198 9575 477 9566 449 185 182 183 32 181 2 1 179 3 1 178 52 285 4 3 177 732 10 1249 1130 128 27 350 402 379 161 385 369 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_H #define _LINUX_FS_H #include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/mm_types.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> #include <linux/fs_types.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/maple_tree.h> #include <linux/rw_hint.h> #include <linux/file_ref.h> #include <linux/unicode.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct fileattr; struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) /* File supports atomic writes */ #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) /* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ #define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* FMODE_* bit 13 */ /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) #define FMODE_OPENED ((__force fmode_t)(1 << 19)) #define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) /* File is embedded in backing_file object */ #define FMODE_BACKING ((__force fmode_t)(1 << 24)) /* * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) /* * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * The two FMODE_NONOTIFY* define which fsnotify events should not be generated * for a file. These are the possible values of (f->f_mode & * FMODE_FSNOTIFY_MASK) and their meaning: * * FMODE_NONOTIFY - suppress all (incl. non-permission) events. * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. */ #define FMODE_FSNOTIFY_MASK \ (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) #define FMODE_FSNOTIFY_NONE(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS #define FMODE_FSNOTIFY_PERM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) #define FMODE_FSNOTIFY_HSM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0) #else #define FMODE_FSNOTIFY_PERM(mode) 0 #define FMODE_FSNOTIFY_HSM(mode) 0 #endif /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) #define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. */ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; /* * The two anonymous unions wrap structures with the same member. * * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which * are a dedicated type requiring the filesystem to use the dedicated * helpers. Other filesystem can continue to use ia_{g,u}id until they * have been ported. * * They always contain the same value. In other words FS_ALLOW_IDMAP * pass down the same value on idmapped mounts as they would on regular * mounts. */ union { kuid_t ia_uid; vfsuid_t ia_vfsuid; }; union { kgid_t ia_gid; vfsgid_t ia_vfsgid; }; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Includes for diskquotas. */ #include <linux/quota.h> /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) /* * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the * iocb completion can be passed back to the owner for execution from a safe * context rather than needing to be punted through a workqueue. If this * flag is set, the bio completion handling may set iocb->dio_complete to a * handler function and iocb->private to context information for that handler. * The issuer should call the handler with that context information from task * context to complete the processing of the iocb. Note that while this * provides a task context for the dio_complete() callback, it should only be * used on the completion side for non-IO generating completions. It's fine to * call blocking functions from this callback, but they should not wait for * unrelated IO (like cache flushing, new IO generation, etc). */ #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) #define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ { IOCB_HIPRI, "HIPRI" }, \ { IOCB_DSYNC, "DSYNC" }, \ { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ { IOCB_ATOMIC, "ATOMIC" }, \ { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ { IOCB_DIO_CALLER_COMP, "CALLER_COMP" } struct kiocb { struct file *ki_filp; loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ union { /* * Only used for async buffered reads, where it denotes the * page waitqueue associated with completing the read. Valid * IFF IOCB_WAITQ is set. */ struct wait_page_queue *ki_waitq; /* * Can be used for O_DIRECT IO, where the completion handling * is punted back to the issuer of the IO. May only be set * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer * must then check for presence of this handler when ki_complete * is invoked. The data passed in to this handler must be * assigned to ->private when dio_complete is assigned. */ ssize_t (*dio_complete)(void *data); }; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @invalidate_lock: Guards coherency between page cache contents and * file offset->disk block mappings in the filesystem during invalidates. * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. * @i_private_list: For use by the owner of the address_space. * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; struct xarray i_pages; struct rw_semaphore invalidate_lock; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_read(struct address_space *mapping) { return down_read_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) /* * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to * cache the ACL. This also means that ->get_inode_acl() can be called in RCU * mode with the LOOKUP_RCU flag. */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ unsigned long i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; time64_t i_atime_sec; time64_t i_mtime_sec; time64_t i_ctime_sec; u32 i_atime_nsec; u32 i_mtime_nsec; u32 i_ctime_nsec; u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ u32 i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; union { struct list_head i_devices; int i_linklen; }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; } /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. */ #define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit); static inline void inode_wake_up_bit(struct inode *inode, u32 bit) { /* Caller is responsible for correct memory barriers. */ wake_up_var(inode_state_wait_address(inode, bit)); } struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } /* * inode->i_mutex nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } static inline void filemap_invalidate_lock(struct address_space *mapping) { down_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock(struct address_space *mapping) { up_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_lock_shared(struct address_space *mapping) { down_read(&mapping->invalidate_lock); } static inline int filemap_invalidate_trylock_shared( struct address_space *mapping) { return down_read_trylock(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock_shared( struct address_space *mapping) { up_read(&mapping->invalidate_lock); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2); void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. */ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else /* Pairs with smp_store_release() in i_size_write() */ return smp_load_acquire(&inode->i_size); #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else /* * Pairs with smp_load_acquire() in i_size_read() to ensure * changes related to inode size (such as page contents) are * visible before we see the changed inode size. */ smp_store_release(&inode->i_size, i_size); #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /** * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. * @async_size: Numer of pages that were/are not needed immediately * and so were/are genuinely "ahead". Start next readahead when * the first of these pages is accessed. * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. * * When this structure is passed to ->readahead(), the "most recent" * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; unsigned int size; unsigned int async_size; unsigned int ra_pages; unsigned int mmap_miss; loff_t prev_pos; }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } /** * struct file - Represents a file * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations * @f_mapping: Contents of a cacheable, mappable object. * @private_data: filesystem or driver specific data * @f_inode: cached inode * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) * @f_ref: reference count */ struct file { spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; struct address_space *f_mapping; void *private_data; struct inode *f_inode; unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; /* pipes */ u64 f_pipe; }; loff_t f_pos; #ifdef CONFIG_SECURITY void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL struct hlist_head *f_ep; #endif union { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; freeptr_t f_freeptr; }; file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) { file_ref_inc(&f->f_ref); return f; } struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_active(struct file **f); #define file_count(f) file_ref_read(&(f)->f_ref) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) #define OFFT_OFFSET_MAX type_max(off_t) #endif int file_f_owner_allocate(struct file *file); static inline struct fown_struct *file_f_owner(const struct file *file) { return READ_ONCE(file->f_owner); } extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } /* * file_dentry() is a relic from the days that overlayfs was using files with a * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. * In those days, file_dentry() was needed to get the underlying fs dentry that * matches f_inode. * Files with "fake" path should not exist nowadays, so use an assertion to make * sure that file_dentry() was not papering over filesystem bugs. */ static inline struct dentry *file_dentry(const struct file *file) { struct dentry *dentry = file->f_path.dentry; WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); return dentry; } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ #define SB_RDONLY BIT(0) /* Mount read-only */ #define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ #define SB_NODEV BIT(2) /* Disallow access to device special files */ #define SB_NOEXEC BIT(3) /* Disallow program execution */ #define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ #define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ #define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ #define SB_NOATIME BIT(10) /* Do not update access times. */ #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ #define SB_SILENT BIT(15) #define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ #define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ #define SB_I_VERSION BIT(23) /* Update inode I_version field */ #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define SB_DEAD BIT(21) #define SB_DYING BIT(24) #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) #define SB_BORN BIT(29) #define SB_ACTIVE BIT(30) #define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) #define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) #define sb_has_strict_encoding(sb) \ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) #if IS_ENABLED(CONFIG_UNICODE) #define sb_no_casefold_compat_fallback(sb) \ (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) #else #define sb_no_casefold_compat_fallback(sb) (1) #endif /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ }; #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { unsigned short frozen; /* Is sb frozen? */ int freeze_kcount; /* How many kernel freeze requests? */ int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif const struct xattr_handler * const *s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsigned int s_quota_types; /* Bitmask of supported quota types */ struct quota_info s_dquot; /* Diskquota specific options */ struct sb_writers s_writers; /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ /* Granularity of c/m/atime in ns (cannot be worse than a second) */ u32 s_time_gran; /* Time limits for c/m/atime in seconds */ time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY u32 s_fsnotify_mask; struct fsnotify_sb_info *s_fsnotify_info; #endif /* * q: why are s_id and s_sysfs_name not the same? both are human * readable strings that identify the filesystem * a: s_id is allowed to change at runtime; it's used in log messages, * and we want to when a device starts out as single device (s_id is dev * name) but then a device is hot added and we have to switch to * identifying it by UUID * but s_sysfs_name is a handle for programmatic access, and can't * change at runtime */ char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ char s_sysfs_name[UUID_STRING_LEN + 1]; unsigned int s_max_links; /* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned. */ struct mutex s_vfs_rename_mutex; /* Kludge */ /* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ const char *s_subtype; const struct dentry_operations *s_d_op; /* default d_op for dentries */ struct shrinker *s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; /* Read-only state of the superblock is being changed */ int s_readonly_remount; /* per-sb errseq_t for reporting writeback errors via syncfs */ errseq_t s_wb_err; /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; /* * Owning user namespace and default context in which to * interpret filesystem uids, gids, quotas, device nodes, * xattrs and security labels. */ struct user_namespace *s_user_ns; /* * The list_lru structure is essentially just a pointer to a table * of per-node lru lists, each of which has its own spinlock. * There is no need to put them into separate cachelines. */ struct list_lru s_dentry_lru; struct list_lru s_inode_lru; struct rcu_head rcu; struct work_struct destroy_work; struct mutex s_sync_lock; /* sync serialisation lock */ /* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; } /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(i_user_ns(inode), inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(i_user_ns(inode), inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(i_user_ns(inode), uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(i_user_ns(inode), gid); } /** * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: whe inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned. */ static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_uid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_uid field needs to be updated, false if not. */ static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. */ static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_gid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_gid field needs to be updated, false if not. */ static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with callers fsuid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with callers fsgid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid is mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && kgid_has_mapping(fs_userns, kgid); } struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update); static inline time64_t inode_get_atime_sec(const struct inode *inode) { return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), .tv_nsec = inode_get_atime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_atime_sec = ts.tv_sec; inode->i_atime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_atime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), .tv_nsec = inode_get_mtime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_mtime_sec = ts.tv_sec; inode->i_mtime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_mtime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_mtime_to_ts(inode, ts); } /* * Multigrain timestamps * * Conditionally use fine-grained ctime and mtime timestamps when there * are users actively observing them via getattr. The primary use-case * for this is NFS clients that use the ctime to distinguish between * different states of the file, and that are often fooled by multiple * operations that occur in the same coarse-grained timer tick. */ #define I_CTIME_QUERIED ((u32)BIT(31)) static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), .tv_nsec = inode_get_ctime_nsec(inode) }; return ts; } struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec } */ static inline struct timespec64 inode_set_ctime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_ctime_to_ts(inode, ts); } struct timespec64 simple_inode_init_ts(struct inode *inode); /* * Snapshotting support. */ /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level-1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read(sb->s_writers.rw_sem + level - 1); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held * @sb: the super we write to * @level: the freeze level * * * > 0 - sb freeze level is held * * 0 - sb freeze level is not held * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } /** * sb_write_started - check if SB_FREEZE_WRITE is held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } /** * sb_write_not_started - check if SB_FREEZE_WRITE is not held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_not_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; } /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_started(file_inode(file)->i_sb); } /** * file_write_not_started - check if SB_FREEZE_WRITE is not held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_not_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_not_started(file_inode(file)->i_sb); } /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem. Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. * * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_mutex (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. * * For example filesystem can call sb_start_intwrite() when starting a * transaction which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). */ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. */ int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); /** * struct renamedata - contains all information required for renaming * @old_mnt_idmap: idmap of the old mount the inode was found from * @old_dir: parent of source * @old_dentry: source * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *old_mnt_idmap; struct inode *old_dir; struct dentry *old_dentry; struct mnt_idmap *new_mnt_idmap; struct inode *new_dir; struct dentry *new_dentry; struct inode **delegated_inode; unsigned int flags; } __randomize_layout; int vfs_rename(struct renamedata *); static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define compat_ptr_ioctl NULL #endif /* * VFS file helper functions. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. * Return 'true' to keep going and 'false' if there are no more entries. */ struct dir_context; typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { filldir_t actor; loff_t pos; }; /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. * * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) * NOMMU_MAP_READ: Can be mapped for reading * NOMMU_MAP_WRITE: Can be mapped for writing * NOMMU_MAP_EXEC: Can be mapped for execution */ #define NOMMU_MAP_COPY 0x00000001 #define NOMMU_MAP_DIRECT 0x00000008 #define NOMMU_MAP_READ VM_MAYREAD #define NOMMU_MAP_WRITE VM_MAYWRITE #define NOMMU_MAP_EXEC VM_MAYEXEC #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) /* * These flags control the behavior of the remap_file_range function pointer. * If it is called with len == 0 that means "remap to end of source file". * See Documentation/filesystems/vfs.rst for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) #define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of * the behavior of the remap operation. The changes must be made by the * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) /* * These flags control the behavior of vfs_copy_file_range(). * They are not available to the user via syscall. * * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops */ #define COPY_FILE_SPLICE (1 << 0) struct iov_iter; struct io_uring_cmd; struct offset_ctx; typedef unsigned int __bitwise fop_flags_t; struct file_operations { struct module *owner; fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); } __randomize_layout; /* Supports async buffered reads */ #define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0)) /* Supports async buffered writes */ #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1)) /* Supports synchronous page faults for mappings */ #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) /* Supports non-exclusive O_DIRECT writes from multiple threads */ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) /* Treat loff_t as unsigned (e.g., /dev/mem) */ #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ #define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, int (*) (struct file *, struct dir_context *)); #define WRAP_DIR_ITER(x) \ static int shared_##x(struct file *file , struct dir_context *ctx) \ { return wrap_directory_iterator(file, ctx, x); } struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { return file->f_op->mmap(file, vma); } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); /** * enum freeze_holder - holder of the freeze * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed * * Indicate who the owner of the freeze or thaw request is and whether * the freeze needs to be exclusive or can nest. * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the * same holder aren't allowed. It is however allowed to hold a single * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at * the same time. This is relied upon by some filesystems during online * repair or similar. */ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*free_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *, enum freeze_holder who); int (*freeze_fs) (struct super_block *); int (*thaw_super) (struct super_block *, enum freeze_holder who); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot __rcu **(*get_dquots)(struct inode *); #endif long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); void (*shutdown)(struct super_block *sb); }; /* * Inode flags - they have no relation to superblock flags now */ #define S_SYNC (1 << 0) /* Writes are synced at once */ #define S_NOATIME (1 << 1) /* Do not update access times */ #define S_APPEND (1 << 2) /* Append-only file */ #define S_IMMUTABLE (1 << 3) /* Immutable file */ #define S_DEAD (1 << 4) /* removed, but still open directory */ #define S_NOQUOTA (1 << 5) /* Inode is not counted to quota */ #define S_DIRSYNC (1 << 6) /* Directory modifications are synchronous */ #define S_NOCMTIME (1 << 7) /* Do not update file c/mtime */ #define S_SWAPFILE (1 << 8) /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE (1 << 9) /* Inode is fs-internal */ #define S_IMA (1 << 10) /* Inode has an associated IMA struct */ #define S_AUTOMOUNT (1 << 11) /* Automount/referral quasi-directory */ #define S_NOSEC (1 << 12) /* no suid or xattr security attributes */ #ifdef CONFIG_FS_DAX #define S_DAX (1 << 13) /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * * Exception: SB_RDONLY is always applied to the entire file system. * * Unfortunately, it is possible to change a filesystems flags with it mounted * with files in use. This means that all of the inodes will not have their * i_flags updated. Hence, i_flags no longer inherit the superblock mount * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) #ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #else #define IS_POSIXACL(inode) 0 #endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #ifdef CONFIG_SWAP #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #else #define IS_SWAPFILE(inode) ((void)(inode), 0U) #endif #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) #define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; } /* * Inode state bits. Protected by inode->i_lock * * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync() (unless I_DIRTY_DATASYNC is also set). * Timestamp updates are the usual cause. * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already * in place because writeback might already be in progress * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and * wait for I_NEW to be released before returning. * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can * also cause waiting on I_NEW, without I_NEW actually * being set. find_inode() uses this to prevent returning * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. * I_CLEAR Added by clear_inode(). In this state the inode is * clean and can be destroyed. Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * * I_SYNC Writeback of inode is running. The bit is set during * data writeback, and cleared with a wakeup on the bit * address once it is done. The bit is also used to pin * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently references on the LRU list. * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. * * I_CREATING New object's inode in the middle of setting up. * * I_DONTCACHE Evict inode as soon as it is not used anymore. * * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait * upon. There's one free address left. */ #define __I_NEW 0 #define I_NEW (1 << __I_NEW) #define __I_SYNC 1 #define I_SYNC (1 << __I_SYNC) #define __I_LRU_ISOLATING 2 #define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_SYNC (1 << 3) #define I_DIRTY_DATASYNC (1 << 4) #define I_DIRTY_PAGES (1 << 5) #define I_WILL_FREE (1 << 6) #define I_FREEING (1 << 7) #define I_CLEAR (1 << 8) #define I_REFERENCED (1 << 9) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define I_WB_SWITCH (1 << 12) #define I_OVL_INUSE (1 << 13) #define I_CREATING (1 << 14) #define I_DONTCACHE (1 << 15) #define I_SYNC_QUEUED (1 << 16) #define I_PINNING_NETFS_WB (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); } static inline void mark_inode_dirty_sync(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY_SYNC); } /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. * Filesystems should call this if when writing an inode when lazytime is * enabled, they want to opportunistically write the timestamps of other inodes * located very nearby on-disk, e.g. in the same inode block. This returns true * if the given inode is in need of such an opportunistic update. Requires * i_lock, or at least later re-checking under i_lock. */ static inline bool inode_is_dirtytime_only(struct inode *inode) { return (inode->i_state & (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode); extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { inc_nlink(inode); mark_inode_dirty(inode); } static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); mark_inode_dirty(inode); } enum file_time_flags { S_ATIME = 1, S_MTIME = 2, S_CTIME = 4, S_VERSION = 8, }; extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); int inode_update_time(struct inode *inode, int flags); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) touch_atime(&file->f_path); } extern int file_modified(struct file *file); int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key invalidate_lock_key; struct lock_class_key i_mutex_dir_key; }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) /** * is_mgtime: is this inode using multigrain timestamps * @inode: inode to test for multigrain timestamps * * Return true if the inode uses multigrain timestamps, false otherwise. */ static inline bool is_mgtime(const struct inode *inode) { return inode->i_opflags & IOP_MGTIME; } extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) ({ \ const struct file_operations *_fops = (fops); \ (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ }) #define fops_put(fops) ({ \ const struct file_operations *_fops = (fops); \ if (_fops) \ module_put((_fops)->owner); \ }) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. */ #define replace_fops(f, fops) \ do { \ struct file *__file = (f); \ fops_put(__file->f_op); \ BUG_ON(!(__file->f_op = (fops))); \ } while(0) extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); int freeze_super(struct super_block *super, enum freeze_holder who); int thaw_super(struct super_block *super, enum freeze_holder who); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) { if (WARN_ON(len > sizeof(sb->s_uuid))) len = sizeof(sb->s_uuid); sb->s_uuid_len = len; memcpy(&sb->s_uuid, uuid, len); } /* set sb sysfs name based on sb->s_bdev */ static inline void super_set_sysfs_name_bdev(struct super_block *sb) { snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); } /* set sb sysfs name based on sb->s_uuid */ static inline void super_set_sysfs_name_uuid(struct super_block *sb) { WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); } /* set sb sysfs name based on sb->s_id */ static inline void super_set_sysfs_name_id(struct super_block *sb) { strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); } /* try to use something standard before you use this */ __printf(2, 3) static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); va_end(args); } extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); /* /sys/fs */ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) /* fs/open.c */ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ atomic_t refcnt; struct audit_names *aname; const char iname[]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); static inline struct mnt_idmap *file_mnt_idmap(const struct file *file) { return mnt_idmap(file->f_path.mnt); } /** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not. */ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { return mnt_idmap(mnt) != &nop_mnt_idmap; } int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, const char *, int, umode_t); static inline struct file *file_open_root_mnt(struct vfsmount *mnt, const char *name, int flags, umode_t mode) { return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct path *backing_file_user_path(struct file *f); /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying * filesystem. When the mapped file path and inode number are displayed to * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the * path and inode number to display to the user, which is the path of the fd * that user has requested to map and the inode number that would be returned * by fstat() on that same fd. */ /* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } /* Get the inode whose inode number to display in /proc/<pid>/maps */ static inline const struct inode *file_user_inode(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return d_inode(backing_file_user_path(f)->dentry); return file_inode(f); } static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); } extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); static inline struct filename *getname(const char __user *name) { return getname_flags(name, 0); } extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) { if (!(flags & AT_EMPTY_PATH)) return getname(name); if (!name) return NULL; return __getname_maybe_null(name); } extern void putname(struct filename *name); DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T)) static inline struct filename *refname(struct filename *name) { atomic_inc(&name->refcnt); return name; } extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); /* Helper for the simple case when original dentry is used */ static inline int finish_open_simple(struct file *file, int error) { if (error) return error; return finish_open(file, file->f_path.dentry, NULL); } /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) extern struct super_block *blockdev_superblock; static inline bool sb_is_blkdev_sb(struct super_block *sb) { return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; } void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; /* fs/char_dev.c */ #define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ #define CHRDEV_MAJOR_DYN_EXT_START 511 #define CHRDEV_MAJOR_DYN_EXT_END 384 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); extern void unregister_chrdev_region(dev_t, unsigned); extern void chrdev_show(struct seq_file *,off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return __register_chrdev(major, 0, 256, name, fops); } static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) { return file_write_and_wait_range(file, 0, LLONG_MAX); } extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); static inline bool iocb_is_dsync(const struct kiocb *iocb) { return (iocb->ki_flags & IOCB_DSYNC) || IS_SYNC(iocb->ki_filp->f_mapping->host); } /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed. */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, iocb->ki_pos - 1); } return count; } extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK extern int bmap(struct inode *inode, sector_t *block); #else static inline int bmap(struct inode *inode, sector_t *block) { return -EINVAL; } #endif int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct inode **); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) { return (inode->i_mode ^ mode) & S_IFMT; } /** * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * * This is a variant of sb_start_write() which is a noop on non-regualr file. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_start_write_trylock(file_inode(file)->i_sb); } /** * file_end_write - drop write access to a superblock of a regular file * @file: the file we wrote to * * Should be matched with a call to file_start_write(). */ static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_end_write(file_inode(file)->i_sb); } /** * kiocb_start_write - get write access to a superblock for async file io * @iocb: the io context we want to submit the write with * * This is a variant of sb_start_write() for async io submission. * Should be matched with a call to kiocb_end_write(). */ static inline void kiocb_start_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); sb_start_write(inode->i_sb); /* * Fool lockdep by telling it the lock got released so that it * doesn't complain about the held lock when we return to userspace. */ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); } /** * kiocb_end_write - drop write access to a superblock after async file io * @iocb: the io context we sumbitted the write with * * Should be matched with a call to kiocb_start_write(). */ static inline void kiocb_end_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); /* * Tell lockdep we inherited freeze protection from submission thread. */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); sb_end_write(inode->i_sb); } /* * This is used for regular files where some users -- especially the * currently executed binary in a process, previously handled via * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap * read-write shared) accesses. * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. * deny_write_access() denies write access to a file. * allow_write_access() re-enables write access to a file. * * The i_writecount field of an inode can have the following values: * 0: no write access, no denied write access * < 0: (-i_writecount) users that denied write access to the file. * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to * use {get,deny}_write_access() - these functions check the sign and refuse * to do the change if sign is wrong. */ static inline int get_write_access(struct inode *inode) { return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline int deny_write_access(struct file *file) { struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); } static inline void allow_write_access(struct file *file) { if (file) atomic_inc(&file_inode(file)->i_writecount); } /* * Do not prevent write to executable file when watched by pre-content events. * * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at * the time of file open and remains constant for entire lifetime of the file, * so if pre-content watches are added post execution or removed before the end * of the execution, it will not cause i_writecount reference leak. */ static inline int exe_file_deny_write_access(struct file *exe_file) { if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return 0; return deny_write_access(exe_file); } static inline void exe_file_allow_write_access(struct file *exe_file) { if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return; allow_write_access(exe_file); } static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) { file->f_mode &= ~FMODE_FSNOTIFY_MASK; file->f_mode |= mode; } static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; } #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(atomic_dec_return(&inode->i_readcount) < 0); } static inline void i_readcount_inc(struct inode *inode) { atomic_inc(&inode->i_readcount); } #else static inline void i_readcount_dec(struct inode *inode) { return; } static inline void i_readcount_inc(struct inode *inode) { return; } #endif extern int do_pipe_flags(int *, int); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); /** * is_dot_dotdot - returns true only if @name is "." or ".." * @name: file name to check * @len: length of file name, in bytes */ static inline bool is_dot_dotdot(const char *name, size_t len) { return len && unlikely(name[0] == '.') && (len == 1 || (len == 2 && name[1] == '.')); } #include <linux/err.h> /* needed for stackable file system support */ extern loff_t default_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } extern void inode_init_once(struct inode *); extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); struct inode *iget5_locked_rcu(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); extern struct inode *find_inode_nowait(struct super_block *, unsigned long, int (*match)(struct inode *, unsigned long, void *), void *data); extern struct inode *find_inode_rcu(struct super_block *, unsigned long, int (*)(struct inode *, void *), void *); extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* * Userspace may rely on the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the * lower 32 bits, so we need to check that those aren't zero explicitly. With * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but * better safe than sorry. */ static inline bool is_zero_ino(ino_t ino) { return (u32)ino == 0; } /* * inode->i_lock must be held */ static inline void __iget(struct inode *inode) { atomic_inc(&inode->i_count); } extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); struct inode *alloc_inode(struct super_block *sb); static inline struct inode *new_inode_pseudo(struct super_block *sb) { return alloc_inode(sb); } extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter); ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/splice.c */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); int rw_verify_area(int, struct file *, const loff_t *, size_t); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, loff_t file_offset); enum { /* need locking between buffered and direct access */ DIO_LOCKING = 0x01, /* filesystem does not support filling holes */ DIO_SKIP_HOLES = 0x02, }; ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif bool inode_dio_finished(const struct inode *inode); void inode_dio_wait(struct inode *inode); void inode_dio_wait_interruptible(struct inode *inode); /** * inode_dio_begin - signal start of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_begin(struct inode *inode) { atomic_inc(&inode->i_dio_count); } /** * inode_dio_end - signal finish of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link_raw(struct dentry *, struct inode *, struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); static inline loff_t __inode_get_bytes(struct inode *inode) { return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; } loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); extern int simple_nosetlease(struct file *, int, struct file_lease **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { struct maple_tree mt; unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); void simple_offset_destroy(struct offset_ctx *octx); extern const struct file_operations simple_offset_dir_operations; extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); extern int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len); #if IS_ENABLED(CONFIG_UNICODE) int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory * * This functions checks if the proposed filename is valid for the * parent directory. That means that only valid UTF-8 filenames will be * accepted for casefold directories from filesystems created with the * strict encoding flag. That also means that any name will be * accepted for directories that doesn't have casefold enabled, or * aren't being strict with the encoding. * * @dir: inode of the directory where the new file will be created * @name: name of the new file * * Return: * * True: if the filename is suitable for this directory. It can be * true if a given name is not suitable for a strict encoding * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; /* * A casefold dir must have a encoding set, unless the filesystem * is corrupted */ if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) return true; return !utf8_validate(dir->i_sb->s_encoding, name); } #else static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { return true; } #endif static inline bool sb_has_encoding(const struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) return !!sb->s_encoding; #else return false; #endif } int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } static inline int iocb_flags(struct file *file) { int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; return res; } static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int rw_type) { int kiocb_flags = 0; /* make sure there's no overlap between RWF and private IOCB flags */ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD); if (!flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) return -EOPNOTSUPP; if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } if (flags & RWF_DONTCACHE) { /* file system must support it */ if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) return -EOPNOTSUPP; /* DAX mappings not supported */ if (IS_DAX(ki->ki_filp->f_mapping->host)) return -EOPNOTSUPP; } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { if (IS_APPEND(file_inode(ki->ki_filp))) return -EPERM; ki->ki_flags &= ~IOCB_APPEND; } ki->ki_flags |= kiocb_flags; return 0; } /* Transaction based IO helpers */ /* * An argresp is stored in an allocated page and holds the * size of the argument or response, along with its content */ struct simple_transaction_argresp { ssize_t size; char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files * * These attributes behave similar to those in sysfs: * * Writing to an attribute immediately sets a value, an open file can be * written to multiple times. * * Reading from an attribute creates a buffer from the value that might get * read with multiple read calls. When the attribute has been read * completely, no further read calls are possible until the file is opened * again. * * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ #define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) #define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { /* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt); int simple_attr_release(struct inode *inode, struct file *file); ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } static inline bool is_root_inode(struct inode *inode) { return inode == inode->i_sb->s_root->d_inode; } static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { return ctx->actor(ctx, name, namelen, ctx->pos, ino, type); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return false; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return false; ctx->pos = 2; } return true; } static inline bool dir_relax(struct inode *inode) { inode_unlock(inode); inode_lock(inode); return !IS_DEADDIR(inode); } static inline bool dir_relax_shared(struct inode *inode) { inode_unlock_shared(inode); inode_lock_shared(inode); return !IS_DEADDIR(inode); } extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); static inline bool vfs_empty_path(int dfd, const char __user *path) { char c; if (dfd < 0) return false; /* We now allow NULL to be used for empty path. */ if (!path) return true; if (unlikely(get_user(c, path))) return false; return !c; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */ |
2 17 10 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * File: af_phonet.h * * Phonet sockets kernel definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef AF_PHONET_H #define AF_PHONET_H #include <linux/phonet.h> #include <linux/skbuff.h> #include <net/sock.h> /* * The lower layers may not require more space, ever. Make sure it's * enough. */ #define MAX_PHONET_HEADER (8 + MAX_HEADER) /* * Every Phonet* socket has this structure first in its * protocol-specific structure under name c. */ struct pn_sock { struct sock sk; u16 sobject; u16 dobject; u8 resource; }; static inline struct pn_sock *pn_sk(struct sock *sk) { return (struct pn_sock *)sk; } extern const struct proto_ops phonet_dgram_ops; void pn_sock_init(void); struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa); void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb); void phonet_get_local_port_range(int *min, int *max); int pn_sock_hash(struct sock *sk); void pn_sock_unhash(struct sock *sk); int pn_sock_get_port(struct sock *sk, unsigned short sport); struct sock *pn_find_sock_by_res(struct net *net, u8 res); int pn_sock_bind_res(struct sock *sock, u8 res); int pn_sock_unbind_res(struct sock *sk, u8 res); void pn_sock_unbind_all_res(struct sock *sk); int pn_skb_send(struct sock *sk, struct sk_buff *skb, const struct sockaddr_pn *target); static inline struct phonethdr *pn_hdr(struct sk_buff *skb) { return (struct phonethdr *)skb_network_header(skb); } static inline struct phonetmsg *pn_msg(struct sk_buff *skb) { return (struct phonetmsg *)skb_transport_header(skb); } /* * Get the other party's sockaddr from received skb. The skb begins * with a Phonet header. */ static inline void pn_skb_get_src_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa) { struct phonethdr *ph = pn_hdr(skb); u16 obj = pn_object(ph->pn_sdev, ph->pn_sobj); sa->spn_family = AF_PHONET; pn_sockaddr_set_object(sa, obj); pn_sockaddr_set_resource(sa, ph->pn_res); memset(sa->spn_zero, 0, sizeof(sa->spn_zero)); } static inline void pn_skb_get_dst_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa) { struct phonethdr *ph = pn_hdr(skb); u16 obj = pn_object(ph->pn_rdev, ph->pn_robj); sa->spn_family = AF_PHONET; pn_sockaddr_set_object(sa, obj); pn_sockaddr_set_resource(sa, ph->pn_res); memset(sa->spn_zero, 0, sizeof(sa->spn_zero)); } /* Protocols in Phonet protocol family. */ struct phonet_protocol { const struct proto_ops *ops; struct proto *prot; int sock_type; }; int phonet_proto_register(unsigned int protocol, const struct phonet_protocol *pp); void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp); int phonet_sysctl_init(void); void phonet_sysctl_exit(void); int isi_register(void); void isi_unregister(void); static inline bool sk_is_phonet(struct sock *sk) { return sk->sk_family == PF_PHONET; } static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int karg; switch (cmd) { case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: if (get_user(karg, (int __user *)arg)) return -EFAULT; return sk->sk_prot->ioctl(sk, cmd, &karg); } /* A positive return value means that the ioctl was not processed */ return 1; } #endif |
26 26 26 26 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 | // SPDX-License-Identifier: GPL-2.0 /* * Block device concurrent positioning ranges. * * Copyright (C) 2021 Western Digital Corporation or its Affiliates. */ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/init.h> #include "blk.h" static ssize_t blk_ia_range_sector_show(struct blk_independent_access_range *iar, char *buf) { return sprintf(buf, "%llu\n", iar->sector); } static ssize_t blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar, char *buf) { return sprintf(buf, "%llu\n", iar->nr_sectors); } struct blk_ia_range_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_independent_access_range *iar, char *buf); }; static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { .attr = { .name = "sector", .mode = 0444 }, .show = blk_ia_range_sector_show, }; static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { .attr = { .name = "nr_sectors", .mode = 0444 }, .show = blk_ia_range_nr_sectors_show, }; static struct attribute *blk_ia_range_attrs[] = { &blk_ia_range_sector_entry.attr, &blk_ia_range_nr_sectors_entry.attr, NULL, }; ATTRIBUTE_GROUPS(blk_ia_range); static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct blk_ia_range_sysfs_entry *entry = container_of(attr, struct blk_ia_range_sysfs_entry, attr); struct blk_independent_access_range *iar = container_of(kobj, struct blk_independent_access_range, kobj); return entry->show(iar, buf); } static const struct sysfs_ops blk_ia_range_sysfs_ops = { .show = blk_ia_range_sysfs_show, }; /* * Independent access range entries are not freed individually, but alltogether * with struct blk_independent_access_ranges and its array of ranges. Since * kobject_add() takes a reference on the parent kobject contained in * struct blk_independent_access_ranges, the array of independent access range * entries cannot be freed until kobject_del() is called for all entries. * So we do not need to do anything here, but still need this no-op release * operation to avoid complaints from the kobject code. */ static void blk_ia_range_sysfs_nop_release(struct kobject *kobj) { } static const struct kobj_type blk_ia_range_ktype = { .sysfs_ops = &blk_ia_range_sysfs_ops, .default_groups = blk_ia_range_groups, .release = blk_ia_range_sysfs_nop_release, }; /* * This will be executed only after all independent access range entries are * removed with kobject_del(), at which point, it is safe to free everything, * including the array of ranges. */ static void blk_ia_ranges_sysfs_release(struct kobject *kobj) { struct blk_independent_access_ranges *iars = container_of(kobj, struct blk_independent_access_ranges, kobj); kfree(iars); } static const struct kobj_type blk_ia_ranges_ktype = { .release = blk_ia_ranges_sysfs_release, }; /** * disk_register_independent_access_ranges - register with sysfs a set of * independent access ranges * @disk: Target disk * * Register with sysfs a set of independent access ranges for @disk. */ int disk_register_independent_access_ranges(struct gendisk *disk) { struct blk_independent_access_ranges *iars = disk->ia_ranges; struct request_queue *q = disk->queue; int i, ret; lockdep_assert_held(&q->sysfs_lock); if (!iars) return 0; /* * At this point, iars is the new set of sector access ranges that needs * to be registered with sysfs. */ WARN_ON(iars->sysfs_registered); ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, &disk->queue_kobj, "%s", "independent_access_ranges"); if (ret) { disk->ia_ranges = NULL; kobject_put(&iars->kobj); return ret; } for (i = 0; i < iars->nr_ia_ranges; i++) { ret = kobject_init_and_add(&iars->ia_range[i].kobj, &blk_ia_range_ktype, &iars->kobj, "%d", i); if (ret) { while (--i >= 0) kobject_del(&iars->ia_range[i].kobj); kobject_del(&iars->kobj); kobject_put(&iars->kobj); return ret; } } iars->sysfs_registered = true; return 0; } void disk_unregister_independent_access_ranges(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_independent_access_ranges *iars = disk->ia_ranges; int i; lockdep_assert_held(&q->sysfs_lock); if (!iars) return; if (iars->sysfs_registered) { for (i = 0; i < iars->nr_ia_ranges; i++) kobject_del(&iars->ia_range[i].kobj); kobject_del(&iars->kobj); kobject_put(&iars->kobj); } else { kfree(iars); } disk->ia_ranges = NULL; } static struct blk_independent_access_range * disk_find_ia_range(struct blk_independent_access_ranges *iars, sector_t sector) { struct blk_independent_access_range *iar; int i; for (i = 0; i < iars->nr_ia_ranges; i++) { iar = &iars->ia_range[i]; if (sector >= iar->sector && sector < iar->sector + iar->nr_sectors) return iar; } return NULL; } static bool disk_check_ia_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars) { struct blk_independent_access_range *iar, *tmp; sector_t capacity = get_capacity(disk); sector_t sector = 0; int i; if (WARN_ON_ONCE(!iars->nr_ia_ranges)) return false; /* * While sorting the ranges in increasing LBA order, check that the * ranges do not overlap, that there are no sector holes and that all * sectors belong to one range. */ for (i = 0; i < iars->nr_ia_ranges; i++) { tmp = disk_find_ia_range(iars, sector); if (!tmp || tmp->sector != sector) { pr_warn("Invalid non-contiguous independent access ranges\n"); return false; } iar = &iars->ia_range[i]; if (tmp != iar) { swap(iar->sector, tmp->sector); swap(iar->nr_sectors, tmp->nr_sectors); } sector += iar->nr_sectors; } if (sector != capacity) { pr_warn("Independent access ranges do not match disk capacity\n"); return false; } return true; } static bool disk_ia_ranges_changed(struct gendisk *disk, struct blk_independent_access_ranges *new) { struct blk_independent_access_ranges *old = disk->ia_ranges; int i; if (!old) return true; if (old->nr_ia_ranges != new->nr_ia_ranges) return true; for (i = 0; i < old->nr_ia_ranges; i++) { if (new->ia_range[i].sector != old->ia_range[i].sector || new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors) return true; } return false; } /** * disk_alloc_independent_access_ranges - Allocate an independent access ranges * data structure * @disk: target disk * @nr_ia_ranges: Number of independent access ranges * * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges * access range descriptors. */ struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges) { struct blk_independent_access_ranges *iars; iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges), GFP_KERNEL, disk->queue->node); if (iars) iars->nr_ia_ranges = nr_ia_ranges; return iars; } EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges); /** * disk_set_independent_access_ranges - Set a disk independent access ranges * @disk: target disk * @iars: independent access ranges structure * * Set the independent access ranges information of the request queue * of @disk to @iars. If @iars is NULL and the independent access ranges * structure already set is cleared. If there are no differences between * @iars and the independent access ranges structure already set, @iars * is freed. */ void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars) { struct request_queue *q = disk->queue; mutex_lock(&q->sysfs_lock); if (iars && !disk_check_ia_ranges(disk, iars)) { kfree(iars); iars = NULL; } if (iars && !disk_ia_ranges_changed(disk, iars)) { kfree(iars); goto unlock; } /* * This may be called for a registered queue. E.g. during a device * revalidation. If that is the case, we need to unregister the old * set of independent access ranges and register the new set. If the * queue is not registered, registration of the device request queue * will register the independent access ranges. */ disk_unregister_independent_access_ranges(disk); disk->ia_ranges = iars; if (blk_queue_registered(q)) disk_register_independent_access_ranges(disk); unlock: mutex_unlock(&q->sysfs_lock); } EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges); |
97 14 29 16 29 1 15 15 12 15 15 3 3 15 15 15 15 15 15 8 6 5 4 8 16 53 54 19 18 19 36 36 35 36 29 7 36 54 54 25 5 5 5 20 19 20 20 20 20 109 108 109 33 3 1 3 108 108 108 37 37 37 37 88 108 111 112 89 39 110 111 108 8 8 8 6 13 12 3 2 1 9 9 9 9 9 7 7 7 6 6 3 9 13 35 45 45 2 2 1 2 22 21 3 22 18 4 3 2 2 1 2 17 14 13 13 17 2 15 2 13 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 | /* * linux/fs/hfs/inode.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains inode-related functions which do not depend on * which scheme is being used to represent forks. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/uio.h> #include <linux/xattr.h> #include <linux/blkdev.h> #include "hfs_fs.h" #include "btree.h" static const struct file_operations hfs_file_operations; static const struct inode_operations hfs_file_inode_operations; /*================ Variable-like macros ================*/ #define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO) static int hfs_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, hfs_get_block); } static void hfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); hfs_file_truncate(inode); } } int hfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret; ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata, hfs_get_block, &HFS_I(mapping->host)->phys_size); if (unlikely(ret)) hfs_write_failed(mapping, pos + len); return ret; } static sector_t hfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, hfs_get_block); } static bool hfs_release_folio(struct folio *folio, gfp_t mask) { struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct hfs_btree *tree; struct hfs_bnode *node; u32 nidx; int i; bool res = true; switch (inode->i_ino) { case HFS_EXT_CNID: tree = HFS_SB(sb)->ext_tree; break; case HFS_CAT_CNID: tree = HFS_SB(sb)->cat_tree; break; default: BUG(); return false; } if (!tree) return false; if (tree->node_size >= PAGE_SIZE) { nidx = folio->index >> (tree->node_size_shift - PAGE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); if (!node) ; else if (atomic_read(&node->refcnt)) res = false; if (res && node) { hfs_bnode_unhash(node); hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { nidx = folio->index << (PAGE_SHIFT - tree->node_size_shift); i = 1 << (PAGE_SHIFT - tree->node_size_shift); spin_lock(&tree->hash_lock); do { node = hfs_bnode_findhash(tree, nidx++); if (!node) continue; if (atomic_read(&node->refcnt)) { res = false; break; } hfs_bnode_unhash(node); hfs_bnode_free(node); } while (--i && nidx < tree->node_count); spin_unlock(&tree->hash_lock); } return res ? try_to_free_buffers(folio) : false; } static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = iocb->ki_pos + count; if (end > isize) hfs_write_failed(mapping, end); } return ret; } static int hfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, hfs_get_block); } const struct address_space_operations hfs_btree_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, .writepages = hfs_writepages, .write_begin = hfs_write_begin, .write_end = generic_write_end, .migrate_folio = buffer_migrate_folio, .bmap = hfs_bmap, .release_folio = hfs_release_folio, }; const struct address_space_operations hfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, .write_begin = hfs_write_begin, .write_end = generic_write_end, .bmap = hfs_bmap, .direct_IO = hfs_direct_IO, .writepages = hfs_writepages, .migrate_folio = buffer_migrate_folio, }; /* * hfs_new_inode */ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode = new_inode(sb); if (!inode) return NULL; mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); spin_lock_init(&HFS_I(inode)->open_dir_lock); hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name); inode->i_ino = HFS_SB(sb)->next_id++; inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); simple_inode_init_ts(inode); HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; if (S_ISDIR(mode)) { inode->i_size = 2; HFS_SB(sb)->folder_count++; if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_dirs++; inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; inode->i_mode |= S_IRWXUGO; inode->i_mode &= ~HFS_SB(inode->i_sb)->s_dir_umask; } else if (S_ISREG(mode)) { HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; HFS_SB(sb)->file_count++; if (dir->i_ino == HFS_ROOT_CNID) HFS_SB(sb)->root_files++; inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; inode->i_mode |= S_IRUGO|S_IXUGO; if (mode & S_IWUSR) inode->i_mode |= S_IWUGO; inode->i_mode &= ~HFS_SB(inode->i_sb)->s_file_umask; HFS_I(inode)->phys_size = 0; HFS_I(inode)->alloc_blocks = 0; HFS_I(inode)->first_blocks = 0; HFS_I(inode)->cached_start = 0; HFS_I(inode)->cached_blocks = 0; memset(HFS_I(inode)->first_extents, 0, sizeof(hfs_extent_rec)); memset(HFS_I(inode)->cached_extents, 0, sizeof(hfs_extent_rec)); } insert_inode_hash(inode); mark_inode_dirty(inode); set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); return inode; } void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { HFS_SB(sb)->folder_count--; if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_dirs--; set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); return; } HFS_SB(sb)->file_count--; if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) HFS_SB(sb)->root_files--; if (S_ISREG(inode->i_mode)) { if (!inode->i_nlink) { inode->i_size = 0; hfs_file_truncate(inode); } } set_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 __log_size, __be32 phys_size, u32 clump_size) { struct super_block *sb = inode->i_sb; u32 log_size = be32_to_cpu(__log_size); u16 count; int i; memcpy(HFS_I(inode)->first_extents, ext, sizeof(hfs_extent_rec)); for (count = 0, i = 0; i < 3; i++) count += be16_to_cpu(ext[i].count); HFS_I(inode)->first_blocks = count; HFS_I(inode)->cached_start = 0; HFS_I(inode)->cached_blocks = 0; inode->i_size = HFS_I(inode)->phys_size = log_size; HFS_I(inode)->fs_blocks = (log_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; inode_set_bytes(inode, HFS_I(inode)->fs_blocks << sb->s_blocksize_bits); HFS_I(inode)->alloc_blocks = be32_to_cpu(phys_size) / HFS_SB(sb)->alloc_blksz; HFS_I(inode)->clump_blocks = clump_size / HFS_SB(sb)->alloc_blksz; if (!HFS_I(inode)->clump_blocks) HFS_I(inode)->clump_blocks = HFS_SB(sb)->clumpablks; } struct hfs_iget_data { struct hfs_cat_key *key; hfs_cat_rec *rec; }; static int hfs_test_inode(struct inode *inode, void *data) { struct hfs_iget_data *idata = data; hfs_cat_rec *rec; rec = idata->rec; switch (rec->type) { case HFS_CDR_DIR: return inode->i_ino == be32_to_cpu(rec->dir.DirID); case HFS_CDR_FIL: return inode->i_ino == be32_to_cpu(rec->file.FlNum); default: BUG(); return 1; } } /* * hfs_read_inode */ static int hfs_read_inode(struct inode *inode, void *data) { struct hfs_iget_data *idata = data; struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); hfs_cat_rec *rec; HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; mutex_init(&HFS_I(inode)->extents_lock); INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list); spin_lock_init(&HFS_I(inode)->open_dir_lock); /* Initialize the inode */ inode->i_uid = hsb->s_uid; inode->i_gid = hsb->s_gid; set_nlink(inode, 1); if (idata->key) HFS_I(inode)->cat_key = *idata->key; else HFS_I(inode)->flags |= HFS_FLG_RSRC; HFS_I(inode)->tz_secondswest = sys_tz.tz_minuteswest * 60; rec = idata->rec; switch (rec->type) { case HFS_CDR_FIL: if (!HFS_IS_RSRC(inode)) { hfs_inode_read_fork(inode, rec->file.ExtRec, rec->file.LgLen, rec->file.PyLen, be16_to_cpu(rec->file.ClpSize)); } else { hfs_inode_read_fork(inode, rec->file.RExtRec, rec->file.RLgLen, rec->file.RPyLen, be16_to_cpu(rec->file.ClpSize)); } inode->i_ino = be32_to_cpu(rec->file.FlNum); inode->i_mode = S_IRUGO | S_IXUGO; if (!(rec->file.Flags & HFS_FIL_LOCK)) inode->i_mode |= S_IWUGO; inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat)))); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; break; case HFS_CDR_DIR: inode->i_ino = be32_to_cpu(rec->dir.DirID); inode->i_size = be16_to_cpu(rec->dir.Val) + 2; HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat)))); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; default: make_bad_inode(inode); } return 0; } /* * __hfs_iget() * * Given the MDB for a HFS filesystem, a 'key' and an 'entry' in * the catalog B-tree and the 'type' of the desired file return the * inode for that file/directory or NULL. Note that 'type' indicates * whether we want the actual file or directory, or the corresponding * metadata (AppleDouble header file or CAP metadata file). */ struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec) { struct hfs_iget_data data = { key, rec }; struct inode *inode; u32 cnid; switch (rec->type) { case HFS_CDR_DIR: cnid = be32_to_cpu(rec->dir.DirID); break; case HFS_CDR_FIL: cnid = be32_to_cpu(rec->file.FlNum); break; default: return NULL; } inode = iget5_locked(sb, cnid, hfs_test_inode, hfs_read_inode, &data); if (inode && (inode->i_state & I_NEW)) unlock_new_inode(inode); return inode; } void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, __be32 *log_size, __be32 *phys_size) { memcpy(ext, HFS_I(inode)->first_extents, sizeof(hfs_extent_rec)); if (log_size) *log_size = cpu_to_be32(inode->i_size); if (phys_size) *phys_size = cpu_to_be32(HFS_I(inode)->alloc_blocks * HFS_SB(inode->i_sb)->alloc_blksz); } int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct inode *main_inode = inode; struct hfs_find_data fd; hfs_cat_rec rec; int res; hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; if (inode->i_ino < HFS_FIRSTUSER_CNID) { switch (inode->i_ino) { case HFS_ROOT_CNID: break; case HFS_EXT_CNID: hfs_btree_write(HFS_SB(inode->i_sb)->ext_tree); return 0; case HFS_CAT_CNID: hfs_btree_write(HFS_SB(inode->i_sb)->cat_tree); return 0; default: BUG(); return -EIO; } } if (HFS_IS_RSRC(inode)) main_inode = HFS_I(inode)->rsrc_inode; if (!main_inode->i_nlink) return 0; if (hfs_find_init(HFS_SB(main_inode->i_sb)->cat_tree, &fd)) /* panic? */ return -EIO; res = -EIO; if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN) goto out; fd.search_key->cat = HFS_I(main_inode)->cat_key; if (hfs_brec_find(&fd)) goto out; if (S_ISDIR(main_inode->i_mode)) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); if (rec.type != HFS_CDR_DIR || be32_to_cpu(rec.dir.DirID) != inode->i_ino) { } rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); rec.dir.Val = cpu_to_be16(inode->i_size - 2); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); } else if (HFS_IS_RSRC(inode)) { if (fd.entrylength < sizeof(struct hfs_cat_file)) goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); hfs_inode_write_fork(inode, rec.file.RExtRec, &rec.file.RLgLen, &rec.file.RPyLen); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } else { if (fd.entrylength < sizeof(struct hfs_cat_file)) goto out; hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); if (rec.type != HFS_CDR_FIL || be32_to_cpu(rec.file.FlNum) != inode->i_ino) { } if (inode->i_mode & S_IWUSR) rec.file.Flags &= ~HFS_FIL_LOCK; else rec.file.Flags |= HFS_FIL_LOCK; hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } res = 0; out: hfs_find_exit(&fd); return res; } static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = NULL; hfs_cat_rec rec; struct hfs_find_data fd; int res; if (HFS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) goto out; inode = HFS_I(dir)->rsrc_inode; if (inode) goto out; inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); if (res) { iput(inode); return ERR_PTR(res); } fd.search_key->cat = HFS_I(dir)->cat_key; res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (!res) { struct hfs_iget_data idata = { NULL, &rec }; hfs_read_inode(inode, &idata); } hfs_find_exit(&fd); if (res) { iput(inode); return ERR_PTR(res); } HFS_I(inode)->rsrc_inode = dir; HFS_I(dir)->rsrc_inode = inode; igrab(dir); inode_fake_hash(inode); mark_inode_dirty(inode); dont_mount(dentry); out: return d_splice_alias(inode, dentry); } void hfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; iput(HFS_I(inode)->rsrc_inode); } } static int hfs_file_open(struct inode *inode, struct file *file) { if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; atomic_inc(&HFS_I(inode)->opencnt); return 0; } static int hfs_file_release(struct inode *inode, struct file *file) { //struct super_block *sb = inode->i_sb; if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} inode_unlock(inode); } return 0; } /* * hfs_notify_change() * * Based very closely on fs/msdos/inode.c by Werner Almesberger * * This is the notify_change() field in the super_operations structure * for HFS file systems. The purpose is to take that changes made to * an inode and apply then in a filesystem-dependent manner. In this * case the process has a few of tasks to do: * 1) prevent changes to the i_uid and i_gid fields. * 2) map file permissions to the closest allowable permissions * 3) Since multiple Linux files can share the same on-disk inode under * HFS (for instance the data and resource forks of a file) a change * to permissions must be applied to all other in-core inodes which * correspond to the same HFS file. */ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; error = setattr_prepare(&nop_mnt_idmap, dentry, attr); /* basic permission checks */ if (error) return error; /* no uig/gid changes and limit which mode bits can be set */ if (((attr->ia_valid & ATTR_UID) && (!uid_eq(attr->ia_uid, hsb->s_uid))) || ((attr->ia_valid & ATTR_GID) && (!gid_eq(attr->ia_gid, hsb->s_gid))) || ((attr->ia_valid & ATTR_MODE) && ((S_ISDIR(inode->i_mode) && (attr->ia_mode != inode->i_mode)) || (attr->ia_mode & ~HFS_VALID_MODE_BITS)))) { return hsb->s_quiet ? 0 : error; } if (attr->ia_valid & ATTR_MODE) { /* Only the 'w' bits can ever change and only all together. */ if (attr->ia_mode & S_IWUSR) attr->ia_mode = inode->i_mode | S_IWUGO; else attr->ia_mode = inode->i_mode & ~S_IWUGO; attr->ia_mode &= S_ISDIR(inode->i_mode) ? ~hsb->s_dir_umask: ~hsb->s_file_umask; } if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); simple_inode_init_ts(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); mark_inode_dirty(inode); return 0; } static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; struct super_block * sb; int ret, err; ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); /* sync the superblock to buffers */ sb = inode->i_sb; flush_delayed_work(&HFS_SB(sb)->mdb_work); /* .. finally sync the buffers to disk */ err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; inode_unlock(inode); return ret; } static const struct file_operations hfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = filemap_splice_read, .fsync = hfs_file_fsync, .open = hfs_file_open, .release = hfs_file_release, }; static const struct inode_operations hfs_file_inode_operations = { .lookup = hfs_file_lookup, .setattr = hfs_inode_setattr, .listxattr = generic_listxattr, }; |
51 52 42 27 52 26 26 104 1 94 1 1 1 1 1 1 1 1 1 23 10 23 23 23 23 23 3 23 23 23 23 22 22 22 7 7 134 7 131 131 50 27 29 20 41 42 41 27 129 130 131 131 104 50 131 38 131 34 108 108 108 50 50 50 50 50 50 50 50 49 32 32 32 32 32 32 32 15 11 100 99 100 100 100 55 49 16 16 50 54 54 54 32 55 55 54 24 16 23 104 104 104 104 104 104 104 104 104 104 104 104 35 1 1 35 35 34 34 34 34 2 2 2 2 2 9 8 8 9 2 2 2 2 2 2 2 2 2 9 2 2 2 2 2 2 2 2 2 2 2 2 52 52 52 52 42 42 42 42 52 52 52 52 52 52 34 34 34 52 52 51 51 51 51 38 51 51 51 50 51 51 51 16 16 16 16 16 16 16 16 16 15 56 92 43 43 1 27 85 85 83 83 82 82 82 82 82 82 4 81 85 84 81 85 64 80 79 78 80 1 79 63 62 19 5 4 18 16 19 17 2 2 2 79 67 67 79 9 3 77 15 15 78 1 5 3 3 3 63 63 63 63 4 62 62 62 60 60 62 60 62 52 1 11 11 11 11 10 1 11 33 42 13 52 45 13 2 2 1 2 41 1 1 1 12 1 4 1 1 41 40 41 41 40 40 34 40 2 2 39 39 39 8 39 3 1 33 34 22 8 11 7 11 24 33 7 7 1 1 1 1 1 5 36 14 14 14 1 13 13 13 12 12 37 23 25 24 21 24 7 2 2 2 37 35 5 35 4 4 36 6 35 39 40 34 33 41 6 6 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 1 1 1 1 7 2 7 7 7 7 6 3 3 6 6 2 6 6 6 6 2 1 2 2 2 2 7 6 7 7 7 6 6 7 7 7 2 7 7 2 7 6 6 6 6 2 2 4 8 8 8 8 6 22 22 12 12 12 12 12 12 12 12 27 27 12 24 19 19 19 17 17 17 16 14 14 15 2 12 24 24 3 24 21 12 12 2 2 24 24 22 22 4 22 3 22 22 2 2 2 2 2 2 2 22 21 21 22 22 22 22 3 22 24 21 32 21 33 33 1 33 33 33 33 33 24 24 24 24 33 1 1 33 33 10 23 23 2 19 21 3 33 33 2 33 33 33 33 33 33 32 32 33 33 31 33 30 23 12 33 33 22 33 33 32 33 33 30 33 33 33 33 14 14 33 30 33 33 33 33 33 32 33 33 33 2 33 33 33 33 33 33 33 33 32 31 29 31 31 31 3 31 20 31 2 31 31 33 33 31 31 31 30 33 31 33 33 1 33 32 32 1 33 2 31 31 30 31 33 33 6 6 4 37 19 35 14 2 22 31 35 1 35 14 12 12 10 2 24 22 22 21 7 6 3 35 33 35 35 34 3 1 3 3 3 3 3 4 4 4 4 4 4 3 3 3 3 1 4 4 20 1 41 41 41 41 26 41 41 41 4 40 39 2 39 39 24 19 19 11 11 5 5 5 41 38 39 39 23 23 39 36 39 37 33 33 33 2 39 39 38 39 7 8 8 8 1 1 1 1 8 6 6 94 94 94 93 94 89 89 17 97 96 8 7 2 7 6 6 6 6 6 5 5 6 1 1 1 1 1 4 6 5 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 | // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/mm.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blk-crypto.h> #include <linux/swap.h> #include <linux/prefetch.h> #include <linux/uio.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include <linux/iomap.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "iostat.h" #include <trace/events/f2fs.h> #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static struct kmem_cache *bio_entry_slab; static mempool_t *bio_post_read_ctx_pool; static struct bio_set f2fs_bioset; #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE int __init f2fs_init_bioset(void) { return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); } void f2fs_destroy_bioset(void) { bioset_exit(&f2fs_bioset); } bool f2fs_is_cp_guaranteed(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode; struct f2fs_sb_info *sbi; if (!mapping) return false; inode = mapping->host; sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi) || inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode)) return true; if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) || page_private_gcing(page)) return true; return false; } static enum count_type __read_io_type(struct folio *folio) { struct address_space *mapping = folio->mapping; if (mapping) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_META_INO(sbi)) return F2FS_RD_META; if (inode->i_ino == F2FS_NODE_INO(sbi)) return F2FS_RD_NODE; } return F2FS_RD_DATA; } /* postprocessing steps for read bios */ enum bio_post_read_step { #ifdef CONFIG_FS_ENCRYPTION STEP_DECRYPT = BIT(0), #else STEP_DECRYPT = 0, /* compile out the decryption-related code */ #endif #ifdef CONFIG_F2FS_FS_COMPRESSION STEP_DECOMPRESS = BIT(1), #else STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ #endif #ifdef CONFIG_FS_VERITY STEP_VERITY = BIT(2), #else STEP_VERITY = 0, /* compile out the verity-related code */ #endif }; struct bio_post_read_ctx { struct bio *bio; struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; /* * decompression_attempted keeps track of whether * f2fs_end_read_compressed_page() has been called on the pages in the * bio that belong to a compressed cluster yet. */ bool decompression_attempted; block_t fs_blkaddr; }; /* * Update and unlock a bio's pages, and free the bio. * * This marks pages up-to-date only if there was no error in the bio (I/O error, * decryption error, or verity error), as indicated by bio->bi_status. * * "Compressed pages" (pagecache pages backed by a compressed cluster on-disk) * aren't marked up-to-date here, as decompression is done on a per-compression- * cluster basis rather than a per-bio basis. Instead, we only must do two * things for each compressed page here: call f2fs_end_read_compressed_page() * with failed=true if an error occurred before it would have normally gotten * called (i.e., I/O error or decryption error, but *not* verity error), and * release the bio's reference to the decompress_io_ctx of the page's cluster. */ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct folio_iter fi; struct bio_post_read_ctx *ctx = bio->bi_private; bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; if (f2fs_is_compressed_page(&folio->page)) { if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(&folio->page, true, 0, in_task); f2fs_put_page_dic(&folio->page, in_task); continue; } dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); folio_end_read(folio, bio->bi_status == 0); } if (ctx) mempool_free(ctx, bio_post_read_ctx_pool); bio_put(bio); } static void f2fs_verify_bio(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; bool may_have_compressed_pages = (ctx->enabled_steps & STEP_DECOMPRESS); /* * fsverity_verify_bio() may call readahead() again, and while verity * will be disabled for this, decryption and/or decompression may still * be needed, resulting in another bio_post_read_ctx being allocated. * So to prevent deadlocks we need to release the current ctx to the * mempool first. This assumes that verity is the last post-read step. */ mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; /* * Verify the bio's pages with fs-verity. Exclude compressed pages, * as those were handled separately by f2fs_end_read_compressed_page(). */ if (may_have_compressed_pages) { struct bio_vec *bv; struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, bio, iter_all) { struct page *page = bv->bv_page; if (!f2fs_is_compressed_page(page) && !fsverity_verify_page(page)) { bio->bi_status = BLK_STS_IOERR; break; } } } else { fsverity_verify_bio(bio); } f2fs_finish_read_bio(bio, true); } /* * If the bio's data needs to be verified with fs-verity, then enqueue the * verity work for the bio. Otherwise finish the bio now. * * Note that to avoid deadlocks, the verity work can't be done on the * decryption/decompression workqueue. This is because verifying the data pages * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. */ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { struct bio_post_read_ctx *ctx = bio->bi_private; if (ctx && (ctx->enabled_steps & STEP_VERITY)) { INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { f2fs_finish_read_bio(bio, in_task); } } /* * Handle STEP_DECOMPRESS by decompressing any compressed clusters whose last * remaining page was read by @ctx->bio. * * Note that a bio may span clusters (even a mix of compressed and uncompressed * clusters) or be for just part of a cluster. STEP_DECOMPRESS just indicates * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. */ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; bool all_compressed = true; block_t blkaddr = ctx->fs_blkaddr; bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; if (f2fs_is_compressed_page(page)) f2fs_end_read_compressed_page(page, false, blkaddr, in_task); else all_compressed = false; blkaddr++; } ctx->decompression_attempted = true; /* * Optimization: if all the bio's pages are compressed, then scheduling * the per-bio verity work is unnecessary, as verity will be fully * handled at the compression cluster level. */ if (all_compressed) ctx->enabled_steps &= ~STEP_VERITY; } static void f2fs_post_read_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) { f2fs_finish_read_bio(bio, true); return; } if (ctx->enabled_steps & STEP_DECOMPRESS) f2fs_handle_step_decompress(ctx, true); f2fs_verify_and_finish_bio(bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; bool intask = in_task(); iostat_update_and_unbind_ctx(bio); ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) bio->bi_status = BLK_STS_IOERR; if (bio->bi_status) { f2fs_finish_read_bio(bio, intask); return; } if (ctx) { unsigned int enabled_steps = ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS); /* * If we have only decompression step between decompression and * decrypt, we don't need post processing for this. */ if (enabled_steps == STEP_DECOMPRESS && !f2fs_low_mem_mode(sbi)) { f2fs_handle_step_decompress(ctx, intask); } else if (enabled_steps) { INIT_WORK(&ctx->work, f2fs_post_read_work); queue_work(ctx->sbi->post_read_wq, &ctx->work); return; } } f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi; struct folio_iter fi; iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; bio_for_each_folio_all(fi, bio) { struct folio *folio = fi.folio; enum count_type type; if (fscrypt_is_bounce_folio(folio)) { struct folio *io_folio = folio; folio = fscrypt_pagecache_folio(io_folio); fscrypt_free_bounce_page(&io_folio->page); } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_is_compressed_page(&folio->page)) { f2fs_compress_write_end_io(bio, &folio->page); continue; } #endif type = WB_DATA_TYPE(&folio->page, false); if (unlikely(bio->bi_status)) { mapping_set_error(folio->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } f2fs_bug_on(sbi, folio->mapping == NODE_MAPPING(sbi) && folio->index != nid_of_node(&folio->page)); dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, folio)) f2fs_del_fsync_node_entry(sbi, &folio->page); clear_page_private_gcing(&folio->page); folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); } #ifdef CONFIG_BLK_DEV_ZONED static void f2fs_zone_write_end_io(struct bio *bio) { struct f2fs_bio_info *io = (struct f2fs_bio_info *)bio->bi_private; bio->bi_private = io->bi_private; complete(&io->zone_wait); f2fs_write_end_io(bio); } #endif struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, sector_t *sector) { struct block_device *bdev = sbi->sb->s_bdev; int i; if (f2fs_is_multi_device(sbi)) { for (i = 0; i < sbi->s_ndevs; i++) { if (FDEV(i).start_blk <= blk_addr && FDEV(i).end_blk >= blk_addr) { blk_addr -= FDEV(i).start_blk; bdev = FDEV(i).bdev; break; } } } if (sector) *sector = SECTOR_FROM_BLOCK(blk_addr); return bdev; } int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) { int i; if (!f2fs_is_multi_device(sbi)) return 0; for (i = 0; i < sbi->s_ndevs; i++) if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) return i; return 0; } static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; if (fio->op != REQ_OP_WRITE) return 0; if (fio->type == DATA) io_flag = fio->sbi->data_io_flag; else if (fio->type == NODE) io_flag = fio->sbi->node_io_flag; else return 0; fua_flag = io_flag & temp_mask; meta_flag = (io_flag >> NR_TEMP_TYPE) & temp_mask; /* * data/node io flag bits per temp: * REQ_META | REQ_FUA | * 5 | 4 | 3 | 2 | 1 | 0 | * Cold | Warm | Hot | Cold | Warm | Hot | */ if (BIT(fio->temp) & meta_flag) op_flags |= REQ_META; if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; if (fio->type == DATA && F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) op_flags |= REQ_PRIO; return op_flags; } static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; struct block_device *bdev; sector_t sector; struct bio *bio; bdev = f2fs_target_device(sbi, fio->new_blkaddr, §or); bio = bio_alloc_bioset(bdev, npages, fio->op | fio->op_flags | f2fs_io_flags(fio), GFP_NOIO, &f2fs_bioset); bio->bi_iter.bi_sector = sector; if (is_read_io(fio->op)) { bio->bi_end_io = f2fs_read_end_io; bio->bi_private = NULL; } else { bio->bi_end_io = f2fs_write_end_io; bio->bi_private = sbi; bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, fio->type, fio->temp); } iostat_alloc_and_bind_ctx(sbi, bio, NULL); if (fio->io_wbc) wbc_init_bio(fio->io_wbc, bio); return bio; } static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx, const struct f2fs_io_info *fio, gfp_t gfp_mask) { /* * The f2fs garbage collector sets ->encrypted_page when it wants to * read/write raw data without encryption. */ if (!fio || !fio->encrypted_page) fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); } static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, pgoff_t next_idx, const struct f2fs_io_info *fio) { /* * The f2fs garbage collector sets ->encrypted_page when it wants to * read/write raw data without encryption. */ if (fio && fio->encrypted_page) return !bio_has_crypt_ctx(bio); return fscrypt_mergeable_bio(bio, inode, next_idx); } void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(!is_read_io(bio_op(bio))); trace_f2fs_submit_read_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { WARN_ON_ONCE(is_read_io(bio_op(bio))); trace_f2fs_submit_write_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; if (!io->bio) return; if (is_read_io(fio->op)) { trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); f2fs_submit_read_bio(io->sbi, io->bio, fio->type); } else { trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); f2fs_submit_write_bio(io->sbi, io->bio, fio->type); } io->bio = NULL; } static bool __has_merged_page(struct bio *bio, struct inode *inode, struct page *page, nid_t ino) { struct bio_vec *bvec; struct bvec_iter_all iter_all; if (!bio) return false; if (!inode && !page && !ino) return true; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *target = bvec->bv_page; if (fscrypt_is_bounce_page(target)) { target = fscrypt_pagecache_page(target); if (IS_ERR(target)) continue; } if (f2fs_is_compressed_page(target)) { target = f2fs_compress_control_page(target); if (IS_ERR(target)) continue; } if (inode && inode == target->mapping->host) return true; if (page && page == target) return true; if (ino && ino == ino_of_node(target)) return true; } return false; } int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi) { int i; for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1 : NR_TEMP_TYPE; int j; sbi->write_io[i] = f2fs_kmalloc(sbi, array_size(n, sizeof(struct f2fs_bio_info)), GFP_KERNEL); if (!sbi->write_io[i]) return -ENOMEM; for (j = HOT; j < n; j++) { struct f2fs_bio_info *io = &sbi->write_io[i][j]; init_f2fs_rwsem(&io->io_rwsem); io->sbi = sbi; io->bio = NULL; io->last_block_in_bio = 0; spin_lock_init(&io->io_lock); INIT_LIST_HEAD(&io->io_list); INIT_LIST_HEAD(&io->bio_list); init_f2fs_rwsem(&io->bio_list_lock); #ifdef CONFIG_BLK_DEV_ZONED init_completion(&io->zone_wait); io->zone_pending_bio = NULL; io->bi_private = NULL; #endif } } return 0; } static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_write(&io->io_rwsem); if (!io->bio) goto unlock_out; /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->bio->bi_opf |= REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA; } __submit_merged_bio(io); unlock_out: f2fs_up_write(&io->io_rwsem); } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type, bool force) { enum temp_type temp; bool ret = true; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { if (!force) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = sbi->write_io[btype] + temp; f2fs_down_read(&io->io_rwsem); ret = __has_merged_page(io->bio, inode, page, ino); f2fs_up_read(&io->io_rwsem); } if (ret) __f2fs_submit_merged_write(sbi, type, temp); /* TODO: use HOT temp only for meta pages now. */ if (type >= META) break; } } void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { __submit_merged_write_cond(sbi, NULL, NULL, 0, type, true); } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type) { __submit_merged_write_cond(sbi, inode, page, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { f2fs_submit_merged_write(sbi, DATA); f2fs_submit_merged_write(sbi, NODE); f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; struct folio *fio_folio = page_folio(fio->page); struct folio *data_folio = fio->encrypted_page ? page_folio(fio->encrypted_page) : fio_folio; if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, fio->is_por ? META_POR : (__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC_ENHANCE))) return -EFSCORRUPTED; trace_f2fs_submit_folio_bio(data_folio, fio); /* Allocate a new bio */ bio = __bio_alloc(fio, 1); f2fs_set_bio_crypt_ctx(bio, fio_folio->mapping->host, fio_folio->index, fio, GFP_NOIO); bio_add_folio_nofail(bio, data_folio, folio_size(data_folio), 0); if (fio->io_wbc && !is_read_io(fio->op)) wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? __read_io_type(data_folio) : WB_DATA_TYPE(fio->page, false)); if (is_read_io(bio_op(bio))) f2fs_submit_read_bio(fio->sbi, bio, fio->type); else f2fs_submit_write_bio(fio->sbi, bio, fio->type); return 0; } static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, block_t last_blkaddr, block_t cur_blkaddr) { if (unlikely(sbi->max_io_bytes && bio->bi_iter.bi_size >= sbi->max_io_bytes)) return false; if (last_blkaddr + 1 != cur_blkaddr) return false; return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); } static bool io_type_is_mergeable(struct f2fs_bio_info *io, struct f2fs_io_info *fio) { if (io->fio.op != fio->op) return false; return io->fio.op_flags == fio->op_flags; } static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, struct f2fs_bio_info *io, struct f2fs_io_info *fio, block_t last_blkaddr, block_t cur_blkaddr) { if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr)) return false; return io_type_is_mergeable(io, fio); } static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, struct page *page, enum temp_type temp) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); be->bio = bio; bio_get(bio); if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) f2fs_bug_on(sbi, 1); f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); f2fs_up_write(&io->bio_list_lock); } static void del_bio_entry(struct bio_entry *be) { list_del(&be->list); kmem_cache_free(bio_entry_slab, be); } static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; int ret = -EAGAIN; for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; struct bio_entry *be; f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (be->bio != *bio) continue; found = true; f2fs_bug_on(sbi, !page_is_mergeable(sbi, *bio, *fio->last_block, fio->new_blkaddr)); if (f2fs_crypt_mergeable_bio(*bio, fio->page->mapping->host, page_folio(fio->page)->index, fio) && bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { ret = 0; break; } /* page can't be merged into bio; submit the bio */ del_bio_entry(be); f2fs_submit_write_bio(sbi, *bio, DATA); break; } f2fs_up_write(&io->bio_list_lock); } if (ret) { bio_put(*bio); *bio = NULL; } return ret; } void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, struct bio **bio, struct page *page) { enum temp_type temp; bool found = false; struct bio *target = bio ? *bio : NULL; f2fs_bug_on(sbi, !target && !page); for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct list_head *head = &io->bio_list; struct bio_entry *be; if (list_empty(head)) continue; f2fs_down_read(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, page, 0); if (found) break; } f2fs_up_read(&io->bio_list_lock); if (!found) continue; found = false; f2fs_down_write(&io->bio_list_lock); list_for_each_entry(be, head, list) { if (target) found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, page, 0); if (found) { target = be->bio; del_bio_entry(be); break; } } f2fs_up_write(&io->bio_list_lock); } if (found) f2fs_submit_write_bio(sbi, target, DATA); if (bio && *bio) { bio_put(*bio); *bio = NULL; } } int f2fs_merge_page_bio(struct f2fs_io_info *fio) { struct bio *bio = *fio->bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; struct folio *folio = page_folio(fio->page); if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) return -EFSCORRUPTED; trace_f2fs_submit_folio_bio(page_folio(page), fio); if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block, fio->new_blkaddr)) f2fs_submit_merged_ipu_write(fio->sbi, &bio, NULL); alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, folio->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { if (add_ipu_page(fio, &bio, page)) goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; return 0; } #ifdef CONFIG_BLK_DEV_ZONED static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr) { struct block_device *bdev = sbi->sb->s_bdev; int devi = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkaddr); if (blkaddr < FDEV(devi).start_blk || blkaddr > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkaddr); return false; } blkaddr -= FDEV(devi).start_blk; bdev = FDEV(devi).bdev; } return bdev_is_zoned(bdev) && f2fs_blkz_is_seq(sbi, devi, blkaddr) && (blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1); } #endif void f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; enum count_type type; f2fs_bug_on(sbi, is_read_io(fio->op)); f2fs_down_write(&io->io_rwsem); next: #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) { wait_for_completion_io(&io->zone_wait); bio_put(io->zone_pending_bio); io->zone_pending_bio = NULL; io->bi_private = NULL; } #endif if (fio->in_list) { spin_lock(&io->io_lock); if (list_empty(&io->io_list)) { spin_unlock(&io->io_lock); goto out; } fio = list_first_entry(&io->io_list, struct f2fs_io_info, list); list_del(&fio->list); spin_unlock(&io->io_lock); } verify_fio_blkaddr(fio); if (fio->encrypted_page) bio_page = fio->encrypted_page; else if (fio->compressed_page) bio_page = fio->compressed_page; else bio_page = fio->page; /* set submitted = true as a return value */ fio->submitted = 1; type = WB_DATA_TYPE(bio_page, fio->compressed_page); inc_page_count(sbi, type); if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, fio->new_blkaddr) || !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host, page_folio(bio_page)->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { io->bio = __bio_alloc(fio, BIO_MAX_VECS); f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, page_folio(bio_page)->index, fio, GFP_NOIO); io->fio = *fio; } if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; trace_f2fs_submit_folio_write(page_folio(fio->page), fio); #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && btype < META && is_end_zone_blkaddr(sbi, fio->new_blkaddr)) { bio_get(io->bio); reinit_completion(&io->zone_wait); io->bi_private = io->bio->bi_private; io->bio->bi_private = io; io->bio->bi_end_io = f2fs_zone_write_end_io; io->zone_pending_bio = io->bio; __submit_merged_bio(io); } #endif if (fio->in_list) goto next; out: if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || !f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); f2fs_up_write(&io->io_rwsem); } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, blk_opf_t op_flag, pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; sector_t sector; struct block_device *bdev = f2fs_target_device(sbi, blkaddr, §or); bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), REQ_OP_READ | op_flag, for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= STEP_DECRYPT; if (f2fs_need_verity(inode, first_idx)) post_read_steps |= STEP_VERITY; /* * STEP_DECOMPRESS is handled specially, since a compressed file might * contain both compressed and uncompressed clusters. We'll allocate a * bio_post_read_ctx if the file is compressed, but the caller is * responsible for enabling STEP_DECOMPRESS if it's actually needed. */ if (post_read_steps || f2fs_compressed_file(inode)) { /* Due to the mempool, this never fails. */ ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; ctx->fs_blkaddr = blkaddr; ctx->decompression_attempted = false; bio->bi_private = ctx; } iostat_alloc_and_bind_ctx(sbi, bio, ctx); return bio; } /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, block_t blkaddr, blk_opf_t op_flags, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, folio->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, blkaddr); if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) { iostat_update_and_unbind_ctx(bio); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); return -EFAULT; } inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_submit_read_bio(sbi, bio, DATA); return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { __le32 *addr = get_dnode_addr(dn->inode, dn->node_page); dn->data_blkaddr = blkaddr; addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* * Lock ordering for the change of data block address: * ->data_page * ->node_page * update block addresses in the node page */ void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); __set_data_blkaddr(dn, blkaddr); if (set_page_dirty(dn->node_page)) dn->node_changed = true; } void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { f2fs_set_data_blkaddr(dn, blkaddr); f2fs_update_read_extent_cache(dn); } /* dn->ofs_in_node will be returned with up-to-date last block pointer */ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; err = inc_valid_block_count(sbi, dn->inode, &count, true); if (unlikely(err)) return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); if (blkaddr == NULL_ADDR) { __set_data_blkaddr(dn, NEW_ADDR); count--; } } if (set_page_dirty(dn->node_page)) dn->node_changed = true; return 0; } /* Should keep dn->ofs_in_node unchanged */ int f2fs_reserve_new_block(struct dnode_of_data *dn) { unsigned int ofs_in_node = dn->ofs_in_node; int ret; ret = f2fs_reserve_new_blocks(dn, 1); dn->ofs_in_node = ofs_in_node; return ret; } int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; int err; err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; if (dn->data_blkaddr == NULL_ADDR) err = f2fs_reserve_new_block(dn); if (err || need_put) f2fs_put_dnode(dn); return err; } struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct folio *folio; int err; folio = f2fs_grab_cache_folio(mapping, index, for_write); if (IS_ERR(folio)) return folio; if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; goto put_err; } goto got_it; } set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { if (err == -ENOENT && next_pgofs) *next_pgofs = f2fs_get_next_page_offset(&dn, index); goto put_err; } f2fs_put_dnode(&dn); if (unlikely(dn.data_blkaddr == NULL_ADDR)) { err = -ENOENT; if (next_pgofs) *next_pgofs = index + 1; goto put_err; } if (dn.data_blkaddr != NEW_ADDR && !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; goto put_err; } got_it: if (folio_test_uptodate(folio)) { folio_unlock(folio); return folio; } /* * A new dentry page is allocated but not able to be written, since its * new inode page couldn't be allocated due to -ENOSPC. * In such the case, its blkaddr can be remained as NEW_ADDR. * see, f2fs_add_link -> f2fs_get_new_data_page -> * f2fs_init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { folio_zero_segment(folio, 0, folio_size(folio)); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); folio_unlock(folio); return folio; } err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); if (err) goto put_err; return folio; put_err: f2fs_folio_put(folio, true); return ERR_PTR(err); } struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct folio *folio; folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto read; if (folio_test_uptodate(folio)) return folio; f2fs_folio_put(folio, false); read: folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs); if (IS_ERR(folio)) return folio; if (folio_test_uptodate(folio)) return folio; folio_wait_locked(folio); if (unlikely(!folio_test_uptodate(folio))) { f2fs_folio_put(folio, false); return ERR_PTR(-EIO); } return folio; } /* * If it tries to access a hole, return an error. * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. */ struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; struct folio *folio; folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL); if (IS_ERR(folio)) return folio; /* wait for read completion */ folio_lock(folio); if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) { f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } return folio; } /* * Caller ensures that this data page is never allocated. * A new zero-filled data page is allocated in the page cache. * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). * Note that, ipage is set only by make_empty_dir, and if any error occur, * ipage should be released by this function. */ struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { struct address_space *mapping = inode->i_mapping; struct page *page; struct dnode_of_data dn; int err; page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* * before exiting, we should make sure ipage will be released * if any error occur. */ f2fs_put_page(ipage, 1); return ERR_PTR(-ENOMEM); } set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); if (err) { f2fs_put_page(page, 1); return ERR_PTR(err); } if (!ipage) f2fs_put_dnode(&dn); if (PageUptodate(page)) goto got_it; if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); } else { f2fs_put_page(page, 1); /* if ipage exists, blkaddr should be NEW_ADDR */ f2fs_bug_on(F2FS_I_SB(inode), ipage); page = f2fs_get_lock_data_page(inode, index, true); if (IS_ERR(page)) return page; } got_it: if (new_i_size && i_size_read(inode) < ((loff_t)(index + 1) << PAGE_SHIFT)) f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; block_t old_blkaddr; blkcnt_t count = 1; int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr == NULL_ADDR) { err = inc_valid_block_count(sbi, dn->inode, &count, true); if (unlikely(err)) return err; } set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); old_blkaddr = dn->data_blkaddr; err = f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL); if (err) return err; if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); f2fs_update_data_blkaddr(dn, dn->data_blkaddr); return 0; } static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_down_read(&sbi->node_change); else f2fs_lock_op(sbi); } static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) { if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_up_read(&sbi->node_change); else f2fs_unlock_op(sbi); } int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); int err = 0; f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); if (!f2fs_lookup_read_extent_cache_block(dn->inode, index, &dn->data_blkaddr)) err = f2fs_reserve_block(dn, index); f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } static int f2fs_map_no_dnode(struct inode *inode, struct f2fs_map_blocks *map, struct dnode_of_data *dn, pgoff_t pgoff) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); /* * There is one exceptional case that read_node_page() may return * -ENOENT due to filesystem has been shutdown or cp_error, return * -EIO in that case. */ if (map->m_may_create && (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || f2fs_cp_error(sbi))) return -EIO; if (map->m_next_pgofs) *map->m_next_pgofs = f2fs_get_next_page_offset(dn, pgoff); if (map->m_next_extent) *map->m_next_extent = f2fs_get_next_page_offset(dn, pgoff); return 0; } static bool f2fs_map_blocks_cached(struct inode *inode, struct f2fs_map_blocks *map, int flag) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int maxblocks = map->m_len; pgoff_t pgoff = (pgoff_t)map->m_lblk; struct extent_info ei = {}; if (!f2fs_lookup_read_extent_cache(inode, pgoff, &ei)) return false; map->m_pblk = ei.blk + pgoff - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgoff); map->m_flags = F2FS_MAP_MAPPED; if (map->m_next_extent) *map->m_next_extent = pgoff + map->m_len; /* for hardware encryption, but to avoid potential issue in future */ if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); if (f2fs_allow_multi_device_dio(sbi, flag)) { int bidx = f2fs_target_device_index(sbi, map->m_pblk); struct f2fs_dev_info *dev = &sbi->devs[bidx]; map->m_bdev = dev->bdev; map->m_pblk -= dev->start_blk; map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); } else { map->m_bdev = inode->i_sb->s_bdev; } return true; } static bool map_is_mergeable(struct f2fs_sb_info *sbi, struct f2fs_map_blocks *map, block_t blkaddr, int flag, int bidx, int ofs) { if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) return false; if (map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) return true; if (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) return true; if (flag == F2FS_GET_BLOCK_PRE_DIO) return true; if (flag == F2FS_GET_BLOCK_DIO && map->m_pblk == NULL_ADDR && blkaddr == NULL_ADDR) return true; return false; } /* * f2fs_map_blocks() tries to find or build mapping relationship which * maps continuous logical blocks to physical blocks, and return such * info via f2fs_map_blocks structure. */ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE; pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; block_t blkaddr; unsigned int start_pgofs; int bidx = 0; bool is_hole; if (!maxblocks) return 0; if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag)) goto out; map->m_bdev = inode->i_sb->s_bdev; map->m_multidev_dio = f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; next_dnode: if (map->m_may_create) f2fs_map_lock(sbi, flag); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, pgofs, mode); if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; if (err == -ENOENT) err = f2fs_map_no_dnode(inode, map, &dn, pgofs); goto unlock_out; } start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); is_hole = !__is_valid_data_blkaddr(blkaddr); if (!is_hole && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; goto sync_out; } /* use out-place-update for direct IO under LFS mode */ if (map->m_may_create && (is_hole || (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && !f2fs_is_pinned_file(inode)))) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto sync_out; } switch (flag) { case F2FS_GET_BLOCK_PRE_AIO: if (blkaddr == NULL_ADDR) { prealloc++; last_ofs_in_node = dn.ofs_in_node; } break; case F2FS_GET_BLOCK_PRE_DIO: case F2FS_GET_BLOCK_DIO: err = __allocate_data_block(&dn, map->m_seg_type); if (err) goto sync_out; if (flag == F2FS_GET_BLOCK_PRE_DIO) file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); break; default: WARN_ON_ONCE(1); err = -EIO; goto sync_out; } blkaddr = dn.data_blkaddr; if (is_hole) map->m_flags |= F2FS_MAP_NEW; } else if (is_hole) { if (f2fs_compressed_file(inode) && f2fs_sanity_check_cluster(&dn)) { err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_CLUSTER); goto sync_out; } switch (flag) { case F2FS_GET_BLOCK_PRECACHE: goto sync_out; case F2FS_GET_BLOCK_BMAP: map->m_pblk = 0; goto sync_out; case F2FS_GET_BLOCK_FIEMAP: if (blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } break; case F2FS_GET_BLOCK_DIO: if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; break; default: /* for defragment case */ if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; goto sync_out; } } if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; if (map->m_multidev_dio) bidx = f2fs_target_device_index(sbi, blkaddr); if (map->m_len == 0) { /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) map->m_flags |= F2FS_MAP_DELALLOC; /* DIO READ and hole case, should not map the blocks. */ if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create)) map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; map->m_len = 1; if (map->m_multidev_dio) map->m_bdev = FDEV(bidx).bdev; } else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) { ofs++; map->m_len++; } else { goto sync_out; } skip: dn.ofs_in_node++; pgofs++; /* preallocate blocks in batch for one dnode page */ if (flag == F2FS_GET_BLOCK_PRE_AIO && (pgofs == end || dn.ofs_in_node == end_offset)) { dn.ofs_in_node = ofs_in_node; err = f2fs_reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { err = -ENOSPC; goto sync_out; } dn.ofs_in_node = end_offset; } if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) && map->m_may_create) { /* the next block to be allocated may not be contiguous. */ if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) == CAP_BLKS_PER_SEC(sbi) - 1) goto sync_out; } if (pgofs >= end) goto sync_out; else if (dn.ofs_in_node < end_offset) goto next_block; if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } } f2fs_put_dnode(&dn); if (map->m_may_create) { f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; sync_out: if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { /* * for hardware encryption, but to avoid potential issue * in future */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); if (map->m_multidev_dio) { block_t blk_addr = map->m_pblk; bidx = f2fs_target_device_index(sbi, map->m_pblk); map->m_bdev = FDEV(bidx).bdev; map->m_pblk -= FDEV(bidx).start_blk; if (map->m_may_create) f2fs_update_device_state(sbi, inode->i_ino, blk_addr, map->m_len); f2fs_bug_on(sbi, blk_addr + map->m_len > FDEV(bidx).end_blk + 1); } } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; f2fs_update_read_extent_cache_range(&dn, start_pgofs, map->m_pblk + ofs, map->m_len - ofs); } if (map->m_next_extent) *map->m_next_extent = pgofs + 1; } f2fs_put_dnode(&dn); unlock_out: if (map->m_may_create) { f2fs_map_unlock(sbi, flag); f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, flag, err); return err; } bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) { struct f2fs_map_blocks map; block_t last_lblk; int err; if (pos + len > i_size_read(inode)) return false; map.m_lblk = F2FS_BYTES_TO_BLK(pos); map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; last_lblk = F2FS_BLK_ALIGN(pos + len); while (map.m_lblk < last_lblk) { map.m_len = last_lblk - map.m_lblk; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT); if (err || map.m_len == 0) return false; map.m_lblk += map.m_len; } return true; } static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; struct node_info ni; __u64 phys = 0, len; __u32 flags; nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; if (f2fs_has_inline_xattr(inode)) { int offset; page = f2fs_grab_cache_page(NODE_MAPPING(sbi), inode->i_ino, false); if (!page) return -ENOMEM; err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } phys = F2FS_BLK_TO_BYTES(ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); phys += offset; len = inline_xattr_size(inode); f2fs_put_page(page, 1); flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; if (!xnid) flags |= FIEMAP_EXTENT_LAST; err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); if (err) return err; } if (xnid) { page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); if (!page) return -ENOMEM; err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; } phys = F2FS_BLK_TO_BYTES(ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); flags = FIEMAP_EXTENT_LAST; } if (phys) { err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags); trace_f2fs_fiemap(inode, 0, phys, len, flags, err); } return (err < 0 ? err : 0); } int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct f2fs_map_blocks map; sector_t start_blk, last_blk, blk_len, max_len; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; bool compr_cluster = false, compr_appended; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; unsigned int count_in_cluster = 0; loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); if (ret) return ret; } ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); if (ret) return ret; inode_lock_shared(inode); maxbytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); if (start > maxbytes) { ret = -EFBIG; goto out; } if (len > maxbytes || (maxbytes - len) < start) len = maxbytes - start; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; } if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len); if (ret != -EAGAIN) goto out; } start_blk = F2FS_BYTES_TO_BLK(start); last_blk = F2FS_BYTES_TO_BLK(start + len - 1); blk_len = last_blk - start_blk + 1; max_len = F2FS_BYTES_TO_BLK(maxbytes) - start_blk; next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; map.m_len = blk_len; map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; if (compr_cluster) { map.m_lblk += 1; map.m_len = cluster_size - count_in_cluster; } ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; if (F2FS_BLK_TO_BYTES(start_blk) < maxbytes) goto prep_next; flags |= FIEMAP_EXTENT_LAST; } /* * current extent may cross boundary of inquiry, increase len to * requery. */ if (!compr_cluster && (map.m_flags & F2FS_MAP_MAPPED) && map.m_lblk + map.m_len - 1 == last_blk && blk_len != max_len) { blk_len = max_len; goto next; } compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || !(map.m_flags & F2FS_MAP_FLAGS))) { compr_appended = true; goto skip_fill; } if (size) { flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); trace_f2fs_fiemap(inode, logical, phys, size, flags, ret); if (ret) goto out; size = 0; } if (start_blk > last_blk) goto out; skip_fill: if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; count_in_cluster = 1; } else if (compr_appended) { unsigned int appended_blks = cluster_size - count_in_cluster + 1; size += F2FS_BLK_TO_BYTES(appended_blks); start_blk += appended_blks; compr_cluster = false; } else { logical = F2FS_BLK_TO_BYTES(start_blk); phys = __is_valid_data_blkaddr(map.m_pblk) ? F2FS_BLK_TO_BYTES(map.m_pblk) : 0; size = F2FS_BLK_TO_BYTES(map.m_len); flags = 0; if (compr_cluster) { flags = FIEMAP_EXTENT_ENCODED; count_in_cluster += map.m_len; if (count_in_cluster == cluster_size) { compr_cluster = false; size += F2FS_BLKSIZE; } } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } start_blk += F2FS_BYTES_TO_BLK(size); } prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; else goto next; out: if (ret == 1) ret = 0; inode_unlock_shared(inode); return ret; } static inline loff_t f2fs_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return F2FS_BLK_TO_BYTES(max_file_blocks(inode)); return i_size_read(inode); } static inline blk_opf_t f2fs_ra_op_flags(struct readahead_control *rac) { return rac ? REQ_RAHEAD : 0; } static int f2fs_read_single_page(struct inode *inode, struct folio *folio, unsigned nr_pages, struct f2fs_map_blocks *map, struct bio **bio_ret, sector_t *last_block_in_bio, struct readahead_control *rac) { struct bio *bio = *bio_ret; const unsigned int blocksize = F2FS_BLKSIZE; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t block_nr; pgoff_t index = folio_index(folio); int ret = 0; block_in_file = (sector_t)index; last_block = block_in_file + nr_pages; last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; /* just zeroing out page which is beyond EOF */ if (block_in_file >= last_block) goto zero_out; /* * Map blocks using the previous result first. */ if ((map->m_flags & F2FS_MAP_MAPPED) && block_in_file > map->m_lblk && block_in_file < (map->m_lblk + map->m_len)) goto got_it; /* * Then do more f2fs_map_blocks() calls until we are * done with this page. */ map->m_lblk = block_in_file; map->m_len = last_block - block_in_file; ret = f2fs_map_blocks(inode, map, F2FS_GET_BLOCK_DEFAULT); if (ret) goto out; got_it: if ((map->m_flags & F2FS_MAP_MAPPED)) { block_nr = map->m_pblk + block_in_file - map->m_lblk; folio_set_mappedtodisk(folio); if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; goto out; } } else { zero_out: folio_zero_segment(folio, 0, folio_size(folio)); if (f2fs_need_verity(inode, index) && !fsverity_verify_folio(folio)) { ret = -EIO; goto out; } if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); folio_unlock(folio); goto out; } /* * This page will go to BIO. Do we need to send this * BIO off first? */ if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio, *last_block_in_bio, block_nr) || !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) { submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, f2fs_ra_op_flags(rac), index, false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; goto out; } } /* * If the page is under writeback, we need to wait for * its completion to see the correct decrypted data. */ f2fs_wait_on_block_writeback(inode, block_nr); if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = block_nr; out: *bio_ret = bio; return ret; } #ifdef CONFIG_F2FS_FS_COMPRESSION int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, struct readahead_control *rac, bool for_write) { struct dnode_of_data dn; struct inode *inode = cc->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; const unsigned int blocksize = F2FS_BLKSIZE; struct decompress_io_ctx *dic = NULL; struct extent_info ei = {}; bool from_dnode = true; int i; int ret = 0; if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; from_dnode = false; goto out_put_dnode; } f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; struct folio *folio; if (!page) continue; folio = page_folio(page); if ((sector_t)folio->index >= last_block_in_file) { folio_zero_segment(folio, 0, folio_size(folio)); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); } else if (!folio_test_uptodate(folio)) { continue; } folio_unlock(folio); if (for_write) folio_put(folio); cc->rpages[i] = NULL; cc->nr_rpages--; } /* we are done since all pages are beyond EOF */ if (f2fs_cluster_is_empty(cc)) goto out; if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei)) from_dnode = false; if (!from_dnode) goto skip_reading_dnode; set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) goto out; f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) : ei.blk + i - 1; if (!__is_valid_data_blkaddr(blkaddr)) break; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { ret = -EFAULT; goto out_put_dnode; } cc->nr_cpages++; if (!from_dnode && i >= ei.c_len) break; } /* nothing to decompress */ if (cc->nr_cpages == 0) { ret = 0; goto out_put_dnode; } dic = f2fs_alloc_dic(cc); if (IS_ERR(dic)) { ret = PTR_ERR(dic); goto out_put_dnode; } for (i = 0; i < cc->nr_cpages; i++) { struct folio *folio = page_folio(dic->cpages[i]); block_t blkaddr; struct bio_post_read_ctx *ctx; blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1) : ei.blk + i; f2fs_wait_on_block_writeback(inode, blkaddr); if (f2fs_load_compressed_page(sbi, folio_page(folio, 0), blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) { f2fs_decompress_cluster(dic, true); break; } continue; } if (bio && (!page_is_mergeable(sbi, bio, *last_block_in_bio, blkaddr) || !f2fs_crypt_mergeable_bio(bio, inode, folio->index, NULL))) { submit_and_realloc: f2fs_submit_read_bio(sbi, bio, DATA); bio = NULL; } if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, f2fs_ra_op_flags(rac), folio->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; } } if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; ctx = get_post_read_ctx(bio); ctx->enabled_steps |= STEP_DECOMPRESS; refcount_inc(&dic->refcnt); inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); *last_block_in_bio = blkaddr; } if (from_dnode) f2fs_put_dnode(&dn); *bio_ret = bio; return 0; out_put_dnode: if (from_dnode) f2fs_put_dnode(&dn); out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { ClearPageUptodate(cc->rpages[i]); unlock_page(cc->rpages[i]); } } *bio_ret = bio; return ret; } #endif /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. */ static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = NULL_CLUSTER, .rpages = NULL, .cpages = NULL, .nr_rpages = 0, .nr_cpages = 0, }; pgoff_t nc_cluster_idx = NULL_CLUSTER; pgoff_t index; #endif unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; for (; nr_pages; nr_pages--) { if (rac) { folio = readahead_folio(rac); prefetchw(&folio->flags); } #ifdef CONFIG_F2FS_FS_COMPRESSION index = folio_index(folio); if (!f2fs_compressed_file(inode)) goto read_single_page; /* there are remained compressed pages, submit them */ if (!f2fs_cluster_can_merge_page(&cc, index)) { ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, rac, false); f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } if (cc.cluster_idx == NULL_CLUSTER) { if (nc_cluster_idx == index >> cc.log_cluster_size) goto read_single_page; ret = f2fs_is_compressed_cluster(inode, index); if (ret < 0) goto set_error_page; else if (!ret) { nc_cluster_idx = index >> cc.log_cluster_size; goto read_single_page; } nc_cluster_idx = NULL_CLUSTER; } ret = f2fs_init_compress_ctx(&cc); if (ret) goto set_error_page; f2fs_compress_ctx_add_page(&cc, folio); goto next_page; read_single_page: #endif ret = f2fs_read_single_page(inode, folio, max_nr_pages, &map, &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: #endif folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION next_page: #endif #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { /* last page */ if (nr_pages == 1 && !f2fs_cluster_is_empty(&cc)) { ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, rac, false); f2fs_destroy_compress_ctx(&cc, false); } } #endif } if (bio) f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); return ret; } static int f2fs_read_data_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; int ret = -EAGAIN; trace_f2fs_readpage(folio, DATA); if (!f2fs_is_compress_backend_ready(inode)) { folio_unlock(folio); return -EOPNOTSUPP; } /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, folio); if (ret == -EAGAIN) ret = f2fs_mpage_readpages(inode, NULL, folio); return ret; } static void f2fs_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) return; /* If the file has inline data, skip readahead */ if (f2fs_has_inline_data(inode)) return; f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; struct page *mpage, *page; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) return 0; page = fio->compressed_page ? fio->compressed_page : fio->page; if (fscrypt_inode_uses_inline_crypto(inode)) return 0; retry_encrypt: fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page), PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } return PTR_ERR(fio->encrypted_page); } mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); if (mpage) { if (PageUptodate(mpage)) memcpy(page_address(mpage), page_address(fio->encrypted_page), PAGE_SIZE); f2fs_put_page(mpage, 1); } return 0; } static inline bool check_inplace_update_policy(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (IS_F2FS_IPU_HONOR_OPU_WRITE(sbi) && is_inode_flag_set(inode, FI_OPU_WRITE)) return false; if (IS_F2FS_IPU_FORCE(sbi)) return true; if (IS_F2FS_IPU_SSR(sbi) && f2fs_need_SSR(sbi)) return true; if (IS_F2FS_IPU_UTIL(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; if (IS_F2FS_IPU_SSR_UTIL(sbi) && f2fs_need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; /* * IPU for rewrite async pages */ if (IS_F2FS_IPU_ASYNC(sbi) && fio && fio->op == REQ_OP_WRITE && !(fio->op_flags & REQ_SYNC) && !IS_ENCRYPTED(inode)) return true; /* this is only set during fdatasync */ if (IS_F2FS_IPU_FSYNC(sbi) && is_inode_flag_set(inode, FI_NEED_IPU)) return true; if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) return true; return false; } bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio) { /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return false; if (f2fs_is_pinned_file(inode)) return true; /* if this is cold file, we should overwrite to avoid fragmentation */ if (file_is_cold(inode) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return true; return check_inplace_update_policy(inode, fio); } bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); /* The below cases were checked when setting it. */ if (f2fs_is_pinned_file(inode)) return false; if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return true; if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) return true; if (IS_NOQUOTA(inode)) return true; if (f2fs_used_in_atomic_write(inode)) return true; /* rewrite low ratio compress data w/ OPU mode to avoid fragmentation */ if (f2fs_compressed_file(inode) && F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER && is_inode_flag_set(inode, FI_ENABLE_COMPRESS)) return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return true; if (is_inode_flag_set(inode, FI_OPU_WRITE)) return true; if (fio) { if (page_private_gcing(fio->page)) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) return true; } return false; } static inline bool need_inplace_update(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; if (f2fs_should_update_outplace(inode, fio)) return false; return f2fs_should_update_inplace(inode, fio); } int f2fs_do_write_data_page(struct f2fs_io_info *fio) { struct folio *folio = page_folio(fio->page); struct inode *inode = folio->mapping->host; struct dnode_of_data dn; struct node_info ni; bool ipu_force = false; bool atomic_commit; int err = 0; /* Use COW inode to make dnode_of_data for atomic write */ atomic_commit = f2fs_is_atomic_file(inode) && page_private_atomic(folio_page(folio, 0)); if (atomic_commit) set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0); else set_new_dnode(&dn, inode, NULL, NULL, 0); if (need_inplace_update(fio) && f2fs_lookup_read_extent_cache_block(inode, folio->index, &fio->old_blkaddr)) { if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) return -EFSCORRUPTED; ipu_force = true; fio->need_lock = LOCK_DONE; goto got_it; } /* Deadlock due to between page->lock and f2fs_lock_op */ if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) return -EAGAIN; err = f2fs_get_dnode_of_data(&dn, folio->index, LOOKUP_NODE); if (err) goto out; fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { folio_clear_uptodate(folio); clear_page_private_gcing(folio_page(folio, 0)); goto out_writepage; } got_it: if (__is_valid_data_blkaddr(fio->old_blkaddr) && !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, DATA_GENERIC_ENHANCE)) { err = -EFSCORRUPTED; goto out_writepage; } /* wait for GCed page writeback via META_MAPPING */ if (fio->meta_gc) f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ if (ipu_force || (__is_valid_data_blkaddr(fio->old_blkaddr) && need_inplace_update(fio))) { err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; folio_start_writeback(folio); f2fs_put_dnode(&dn); if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); if (err) { if (fscrypt_inode_uses_fs_layer_crypto(inode)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); } else { set_inode_flag(inode, FI_UPDATE_WRITE); } trace_f2fs_do_write_data_page(folio, IPU); return err; } if (fio->need_lock == LOCK_RETRY) { if (!f2fs_trylock_op(fio->sbi)) { err = -EAGAIN; goto out_writepage; } fio->need_lock = LOCK_REQ; } err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; fio->version = ni.version; err = f2fs_encrypt_one_page(fio); if (err) goto out_writepage; folio_start_writeback(folio); if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR) f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false); /* LFS mode write path */ f2fs_outplace_write_data(&dn, fio); trace_f2fs_do_write_data_page(folio, OPU); set_inode_flag(inode, FI_APPEND_WRITE); if (atomic_commit) clear_page_private_atomic(folio_page(folio, 0)); out_writepage: f2fs_put_dnode(&dn); out: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); return err; } int f2fs_write_single_data_page(struct folio *folio, int *submitted, struct bio **bio, sector_t *last_block, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance) { struct inode *inode = folio->mapping->host; struct page *page = folio_page(folio, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long)i_size) >> PAGE_SHIFT; loff_t psize = (loff_t)(folio->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; bool quota_inode = IS_NOQUOTA(inode); int err = 0; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = 0, .compr_blocks = compr_blocks, .need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY, .meta_gc = f2fs_meta_inode_gc_required(inode) ? 1 : 0, .io_type = io_type, .io_wbc = wbc, .bio = bio, .last_block = last_block, }; trace_f2fs_writepage(folio, DATA); /* we should bypass data pages to proceed the kworker jobs */ if (unlikely(f2fs_cp_error(sbi))) { mapping_set_error(folio->mapping, -EIO); /* * don't drop any dirty dentry pages for keeping lastest * directory structure. */ if (S_ISDIR(inode->i_mode) && !is_sbi_flag_set(sbi, SBI_IS_CLOSE)) goto redirty_out; /* keep data pages in remount-ro mode */ if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_READONLY) goto redirty_out; goto out; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (folio->index < end_index || f2fs_verity_in_progress(inode) || compr_blocks) goto write; /* * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ offset = i_size & (PAGE_SIZE - 1); if ((folio->index >= end_index + 1) || !offset) goto out; folio_zero_segment(folio, offset, folio_size(folio)); write: /* Dentry/quota blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode) || quota_inode) { /* * We need to wait for node_write to avoid block allocation during * checkpoint. This can only happen to quota writes which can cause * the below discard race condition. */ if (quota_inode) f2fs_down_read(&sbi->node_write); fio.need_lock = LOCK_DONE; err = f2fs_do_write_data_page(&fio); if (quota_inode) f2fs_up_read(&sbi->node_write); goto done; } if (!wbc->for_reclaim) need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; else set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, folio); if (!err) goto out; } if (err == -EAGAIN) { err = f2fs_do_write_data_page(&fio); if (err == -EAGAIN) { f2fs_bug_on(sbi, compr_blocks); fio.need_lock = LOCK_REQ; err = f2fs_do_write_data_page(&fio); } } if (err) { file_set_keep_isize(inode); } else { spin_lock(&F2FS_I(inode)->i_size_lock); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; spin_unlock(&F2FS_I(inode)->i_size_lock); } done: if (err && err != -ENOENT) goto redirty_out; out: inode_dec_dirty_pages(inode); if (err) { folio_clear_uptodate(folio); clear_page_private_gcing(page); } if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); clear_inode_flag(inode, FI_HOT_DATA); f2fs_remove_dirty_inode(inode); submitted = NULL; } folio_unlock(folio); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && !F2FS_I(inode)->wb_task && allow_balance) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_write(sbi, DATA); if (bio && *bio) f2fs_submit_merged_ipu_write(sbi, bio, NULL); submitted = NULL; } if (submitted) *submitted = fio.submitted; return 0; redirty_out: folio_redirty_for_writepage(wbc, folio); /* * pageout() in MM translates EAGAIN, so calls handle_write_error() * -> mapping_set_error() -> set_bit(AS_EIO, ...). * file_write_and_wait_range() will see EIO error, which is critical * to return value of fsync() followed by atomic_write failure to user. */ if (!err || wbc->for_reclaim) return AOP_WRITEPAGE_ACTIVATE; folio_unlock(folio); return err; } /* * This function was copied from write_cache_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { int ret = 0; int done = 0, retry = 0; struct page *pages_local[F2FS_ONSTACK_PAGES]; struct page **pages = pages_local; struct folio_batch fbatch; struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); struct bio *bio = NULL; sector_t last_block; #ifdef CONFIG_F2FS_FS_COMPRESSION struct inode *inode = mapping->host; struct compress_ctx cc = { .inode = inode, .log_cluster_size = F2FS_I(inode)->i_log_cluster_size, .cluster_size = F2FS_I(inode)->i_cluster_size, .cluster_idx = NULL_CLUSTER, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, .private = NULL, }; #endif int nr_folios, p, idx; int nr_pages; unsigned int max_pages = F2FS_ONSTACK_PAGES; pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; int range_whole = 0; xa_mark_t tag; int nwritten = 0; int submitted = 0; int i; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode) && 1 << cc.log_cluster_size > F2FS_ONSTACK_PAGES) { pages = f2fs_kzalloc(sbi, sizeof(struct page *) << cc.log_cluster_size, GFP_NOFS | __GFP_NOFAIL); max_pages = 1 << cc.log_cluster_size; } #endif folio_batch_init(&fbatch); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) set_inode_flag(mapping->host, FI_HOT_DATA); else clear_inode_flag(mapping->host, FI_HOT_DATA); if (wbc->range_cyclic) { index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: retry = 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !retry && (index <= end)) { nr_pages = 0; again: nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) { if (nr_pages) goto write; break; } for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; idx = 0; p = folio_nr_pages(folio); add_more: pages[nr_pages] = folio_page(folio, idx); folio_get(folio); if (++nr_pages == max_pages) { index = folio->index + idx + 1; folio_batch_release(&fbatch); goto write; } if (++idx < p) goto add_more; } folio_batch_release(&fbatch); goto again; write: for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; struct folio *folio = page_folio(page); bool need_readd; readd: need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { void *fsdata = NULL; struct page *pagep; int ret2; ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; break; } if (!f2fs_cluster_can_merge_page(&cc, folio->index)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); if (!ret) need_readd = true; goto result; } if (unlikely(f2fs_cp_error(sbi))) goto lock_folio; if (!f2fs_cluster_is_empty(&cc)) goto lock_folio; if (f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, true)) goto lock_folio; ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, folio->index, &fsdata); if (ret2 < 0) { ret = ret2; done = 1; break; } else if (ret2 && (!f2fs_compress_write_end(inode, fsdata, folio->index, 1) || !f2fs_all_cluster_page_ready(&cc, pages, i, nr_pages, false))) { retry = 1; break; } } #endif /* give a priority to WB_SYNC threads */ if (atomic_read(&sbi->wb_sync_req[DATA]) && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } #ifdef CONFIG_F2FS_FS_COMPRESSION lock_folio: #endif done_index = folio->index; retry_write: folio_lock(folio); if (unlikely(folio->mapping != mapping)) { continue_unlock: folio_unlock(folio); continue; } if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; f2fs_wait_on_page_writeback(&folio->page, DATA, true, true); } if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { folio_get(folio); f2fs_compress_ctx_add_page(&cc, folio); continue; } #endif submitted = 0; ret = f2fs_write_single_data_page(folio, &submitted, &bio, &last_block, wbc, io_type, 0, true); if (ret == AOP_WRITEPAGE_ACTIVATE) folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif nwritten += submitted; wbc->nr_to_write -= submitted; if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to * get # of written pages. */ if (ret == AOP_WRITEPAGE_ACTIVATE) { ret = 0; goto next; } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { f2fs_io_schedule_timeout( DEFAULT_IO_TIMEOUT); goto retry_write; } goto next; } done_index = folio_next_index(folio); done = 1; break; } if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } next: if (need_readd) goto readd; } release_pages(pages, nr_pages); cond_resched(); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* flush remained pages in compress cluster */ if (f2fs_compressed_file(inode) && !f2fs_cluster_is_empty(&cc)) { ret = f2fs_write_multi_pages(&cc, &submitted, wbc, io_type); nwritten += submitted; wbc->nr_to_write -= submitted; if (ret) { done = 1; retry = 0; } } if (f2fs_compressed_file(inode)) f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; end = -1; goto retry; } if (wbc->range_cyclic && !done) done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; if (nwritten) f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, NULL, 0, DATA); /* submit cached bio of IPU write */ if (bio) f2fs_submit_merged_ipu_write(sbi, &bio, NULL); #ifdef CONFIG_F2FS_FS_COMPRESSION if (pages != pages_local) kfree(pages); #endif return ret; } static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { /* to avoid deadlock in path of data flush */ if (F2FS_I(inode)->wb_task) return false; if (!S_ISREG(inode->i_mode)) return false; if (IS_NOQUOTA(inode)) return false; if (f2fs_need_compress_data(inode)) return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) return true; return false; } static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct blk_plug plug; int ret; bool locked = false; /* skip writing if there is no dirty page in this inode */ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; /* skip writing in file defragment preparing stage */ if (is_inode_flag_set(inode, FI_SKIP_WRITES)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, DATA); /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ if (wbc->sync_mode == WB_SYNC_ALL) atomic_inc(&sbi->wb_sync_req[DATA]); else if (atomic_read(&sbi->wb_sync_req[DATA])) { /* to avoid potential deadlock */ if (current->plug) blk_finish_plug(current->plug); goto skip_write; } if (__should_serialize_io(inode, wbc)) { mutex_lock(&sbi->writepages); locked = true; } blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (locked) mutex_unlock(&sbi->writepages); if (wbc->sync_mode == WB_SYNC_ALL) atomic_dec(&sbi->wb_sync_req[DATA]); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. */ f2fs_remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; return __f2fs_write_data_pages(mapping, wbc, F2FS_I(inode)->cp_task == current ? FS_CP_DATA_IO : FS_DATA_IO); } void f2fs_write_failed(struct inode *inode, loff_t to) { loff_t i_size = i_size_read(inode); if (IS_NOQUOTA(inode)) return; /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } static int prepare_write_begin(struct f2fs_sb_info *sbi, struct folio *folio, loff_t pos, unsigned int len, block_t *blk_addr, bool *node_changed) { struct inode *inode = folio->mapping->host; pgoff_t index = folio->index; struct dnode_of_data dn; struct page *ipage; bool locked = false; int flag = F2FS_GET_BLOCK_PRE_AIO; int err = 0; /* * If a whole page is being written and we already preallocated all the * blocks, then there is no need to get a block address now. */ if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ if (f2fs_has_inline_data(inode)) { if (pos + len > MAX_INLINE_DATA(inode)) flag = F2FS_GET_BLOCK_DEFAULT; f2fs_map_lock(sbi, flag); locked = true; } else if ((pos & PAGE_MASK) >= i_size_read(inode)) { f2fs_map_lock(sbi, flag); locked = true; } restart: /* check inline_data */ ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; } set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { f2fs_do_read_inline_data(folio, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) set_page_private_inline(ipage); goto out; } err = f2fs_convert_inline_page(&dn, folio_page(folio, 0)); if (err || dn.data_blkaddr != NULL_ADDR) goto out; } if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (IS_DEVICE_ALIASING(inode)) { err = -ENODATA; goto out; } if (locked) { err = f2fs_reserve_block(&dn, index); goto out; } /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (!err && dn.data_blkaddr != NULL_ADDR) goto out; f2fs_put_dnode(&dn); f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; } out: if (!err) { /* convert_inline_page can make node_changed */ *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; } f2fs_put_dnode(&dn); unlock_out: if (locked) f2fs_map_unlock(sbi, flag); return err; } static int __find_data_block(struct inode *inode, pgoff_t index, block_t *blk_addr) { struct dnode_of_data dn; struct page *ipage; int err = 0; ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); set_new_dnode(&dn, inode, ipage, ipage, 0); if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { /* hole case */ err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { dn.data_blkaddr = NULL_ADDR; err = 0; } } *blk_addr = dn.data_blkaddr; f2fs_put_dnode(&dn); return err; } static int __reserve_data_block(struct inode *inode, pgoff_t index, block_t *blk_addr, bool *node_changed) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; struct page *ipage; int err = 0; f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; } set_new_dnode(&dn, inode, ipage, ipage, 0); if (!f2fs_lookup_read_extent_cache_block(dn.inode, index, &dn.data_blkaddr)) err = f2fs_reserve_block(&dn, index); *blk_addr = dn.data_blkaddr; *node_changed = dn.node_changed; f2fs_put_dnode(&dn); unlock_out: f2fs_map_unlock(sbi, F2FS_GET_BLOCK_PRE_AIO); return err; } static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi, struct folio *folio, loff_t pos, unsigned int len, block_t *blk_addr, bool *node_changed, bool *use_cow) { struct inode *inode = folio->mapping->host; struct inode *cow_inode = F2FS_I(inode)->cow_inode; pgoff_t index = folio->index; int err = 0; block_t ori_blk_addr = NULL_ADDR; /* If pos is beyond the end of file, reserve a new block in COW inode */ if ((pos & PAGE_MASK) >= i_size_read(inode)) goto reserve_block; /* Look for the block in COW inode first */ err = __find_data_block(cow_inode, index, blk_addr); if (err) { return err; } else if (*blk_addr != NULL_ADDR) { *use_cow = true; return 0; } if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE)) goto reserve_block; /* Look for the block in the original inode */ err = __find_data_block(inode, index, &ori_blk_addr); if (err) return err; reserve_block: /* Finally, we should reserve a new block in COW inode for the update */ err = __reserve_data_block(cow_inode, index, blk_addr, node_changed); if (err) return err; inc_atomic_write_cnt(inode); if (ori_blk_addr != NULL_ADDR) *blk_addr = ori_blk_addr; return 0; } static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; bool need_balance = false; bool use_cow = false; block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, len); if (!f2fs_is_checkpoint_ready(sbi)) { err = -ENOSPC; goto fail; } /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: * folio_lock(folio #0) -> folio_lock(inode_page) */ if (index != 0) { err = f2fs_convert_inline_inode(inode); if (err) goto fail; } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { int ret; struct page *page; *fsdata = NULL; if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode))) goto repeat; ret = f2fs_prepare_compress_overwrite(inode, &page, index, fsdata); if (ret < 0) { err = ret; goto fail; } else if (ret) { *foliop = page_folio(page); return 0; } } #endif repeat: /* * Do not use FGP_STABLE to avoid deadlock. * Will wait that below with our IO control. */ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto fail; } /* TODO: cluster can be compressed due to race with .writepage */ *foliop = folio; if (f2fs_is_atomic_file(inode)) err = prepare_atomic_write_begin(sbi, folio, pos, len, &blkaddr, &need_balance, &use_cow); else err = prepare_write_begin(sbi, folio, pos, len, &blkaddr, &need_balance); if (err) goto put_folio; if (need_balance && !IS_NOQUOTA(inode) && has_not_enough_free_secs(sbi, 0, 0)) { folio_unlock(folio); f2fs_balance_fs(sbi, true); folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); goto repeat; } } f2fs_wait_on_page_writeback(&folio->page, DATA, false, true); if (len == folio_size(folio) || folio_test_uptodate(folio)) return 0; if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) && !f2fs_verity_in_progress(inode)) { folio_zero_segment(folio, len, folio_size(folio)); return 0; } if (blkaddr == NEW_ADDR) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); } else { if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ)) { err = -EFSCORRUPTED; goto put_folio; } err = f2fs_submit_page_read(use_cow ? F2FS_I(inode)->cow_inode : inode, folio, blkaddr, 0, true); if (err) goto put_folio; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; goto put_folio; } } return 0; put_folio: folio_unlock(folio); folio_put(folio); fail: f2fs_write_failed(inode, pos + len); return err; } static int f2fs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; trace_f2fs_write_end(inode, pos, len, copied); /* * This should be come from len == PAGE_SIZE, and we expect copied * should be PAGE_SIZE. Otherwise, we treat it with zero copied and * let generic_perform_write() try to copy data again through copied=0. */ if (!folio_test_uptodate(folio)) { if (unlikely(copied != len)) copied = 0; else folio_mark_uptodate(folio); } #ifdef CONFIG_F2FS_FS_COMPRESSION /* overwrite compressed file */ if (f2fs_compressed_file(inode) && fsdata) { f2fs_compress_write_end(inode, fsdata, folio->index, copied); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) f2fs_i_size_write(inode, pos + copied); return copied; } #endif if (!copied) goto unlock_out; folio_mark_dirty(folio); if (f2fs_is_atomic_file(inode)) set_page_private_atomic(folio_page(folio, 0)); if (pos + copied > i_size_read(inode) && !f2fs_verity_in_progress(inode)) { f2fs_i_size_write(inode, pos + copied); if (f2fs_is_atomic_file(inode)) f2fs_i_size_write(F2FS_I(inode)->cow_inode, pos + copied); } unlock_out: folio_unlock(folio); folio_put(folio); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct inode *inode = folio->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && (offset || length != folio_size(folio))) return; if (folio_test_dirty(folio)) { if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); } else { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); } } clear_page_private_all(&folio->page); } bool f2fs_release_folio(struct folio *folio, gfp_t wait) { /* If this is dirty folio, keep private data */ if (folio_test_dirty(folio)) return false; clear_page_private_all(&folio->page); return true; } static bool f2fs_dirty_data_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; trace_f2fs_set_page_dirty(folio, DATA); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); BUG_ON(folio_test_swapcache(folio)); if (filemap_dirty_folio(mapping, folio)) { f2fs_update_dirty_folio(inode, folio); return true; } return false; } static sector_t f2fs_bmap_compress(struct inode *inode, sector_t block) { #ifdef CONFIG_F2FS_FS_COMPRESSION struct dnode_of_data dn; sector_t start_idx, blknr = 0; int ret; start_idx = round_down(block, F2FS_I(inode)->i_cluster_size); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) return 0; if (dn.data_blkaddr != COMPRESS_ADDR) { dn.ofs_in_node += block - start_idx; blknr = f2fs_data_blkaddr(&dn); if (!__is_valid_data_blkaddr(blknr)) blknr = 0; } f2fs_put_dnode(&dn); return blknr; #else return 0; #endif } static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t blknr = 0; if (f2fs_has_inline_data(inode)) goto out; /* make sure allocating whole blocks */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) filemap_write_and_wait(mapping); /* Block number less than F2FS MAX BLOCKS */ if (unlikely(block >= max_file_blocks(inode))) goto out; if (f2fs_compressed_file(inode)) { blknr = f2fs_bmap_compress(inode, block); } else { struct f2fs_map_blocks map; memset(&map, 0, sizeof(map)); map.m_lblk = block; map.m_len = 1; map.m_next_pgofs = NULL; map.m_seg_type = NO_CHECK_TYPE; if (!f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_BMAP)) blknr = map.m_pblk; } out: trace_f2fs_bmap(inode, block, blknr); return blknr; } #ifdef CONFIG_SWAP static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int blkcnt) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int blkofs; unsigned int blk_per_sec = BLKS_PER_SEC(sbi); unsigned int end_blk = start_blk + blkcnt - 1; unsigned int secidx = start_blk / blk_per_sec; unsigned int end_sec; int ret = 0; if (!blkcnt) return 0; end_sec = end_blk / blk_per_sec; f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); set_inode_flag(inode, FI_OPU_WRITE); for (; secidx <= end_sec; secidx++) { unsigned int blkofs_end = secidx == end_sec ? end_blk % blk_per_sec : blk_per_sec - 1; f2fs_down_write(&sbi->pin_sem); ret = f2fs_allocate_pinning_section(sbi); if (ret) { f2fs_up_write(&sbi->pin_sem); break; } set_inode_flag(inode, FI_SKIP_WRITES); for (blkofs = 0; blkofs <= blkofs_end; blkofs++) { struct page *page; unsigned int blkidx = secidx * blk_per_sec + blkofs; page = f2fs_get_lock_data_page(inode, blkidx, true); if (IS_ERR(page)) { f2fs_up_write(&sbi->pin_sem); ret = PTR_ERR(page); goto done; } set_page_dirty(page); f2fs_put_page(page, 1); } clear_inode_flag(inode, FI_SKIP_WRITES); ret = filemap_fdatawrite(inode->i_mapping); f2fs_up_write(&sbi->pin_sem); if (ret) break; } done: clear_inode_flag(inode, FI_SKIP_WRITES); clear_inode_flag(inode, FI_OPU_WRITE); clear_inode_flag(inode, FI_ALIGNED_WRITE); filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int check_swap_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t cur_lblock; block_t last_lblock; block_t pblock; block_t lowest_pblock = -1; block_t highest_pblock = 0; int nr_extents = 0; unsigned int nr_pblocks; unsigned int blks_per_sec = BLKS_PER_SEC(sbi); unsigned int not_aligned = 0; int ret = 0; /* * Map all the blocks into the extent list. This code doesn't try * to be very smart. */ cur_lblock = 0; last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; retry: cond_resched(); memset(&map, 0, sizeof(map)); map.m_lblk = cur_lblock; map.m_len = last_lblock - cur_lblock; map.m_next_pgofs = NULL; map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; map.m_may_create = false; ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* hole */ if (!(map.m_flags & F2FS_MAP_FLAGS)) { f2fs_err(sbi, "Swapfile has holes"); ret = -EINVAL; goto out; } pblock = map.m_pblk; nr_pblocks = map.m_len; if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || nr_pblocks % blks_per_sec || !f2fs_valid_pinned_area(sbi, pblock)) { bool last_extent = false; not_aligned++; nr_pblocks = roundup(nr_pblocks, blks_per_sec); if (cur_lblock + nr_pblocks > sis->max) nr_pblocks -= blks_per_sec; /* this extent is last one */ if (!nr_pblocks) { nr_pblocks = last_lblock - cur_lblock; last_extent = true; } ret = f2fs_migrate_blocks(inode, cur_lblock, nr_pblocks); if (ret) { if (ret == -ENOENT) ret = -EINVAL; goto out; } if (!last_extent) goto retry; } if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; if (cur_lblock) { /* exclude the header page */ if (pblock < lowest_pblock) lowest_pblock = pblock; if (pblock + nr_pblocks - 1 > highest_pblock) highest_pblock = pblock + nr_pblocks - 1; } /* * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks */ ret = add_swap_extent(sis, cur_lblock, nr_pblocks, pblock); if (ret < 0) goto out; nr_extents += ret; cur_lblock += nr_pblocks; } ret = nr_extents; *span = 1 + highest_pblock - lowest_pblock; if (cur_lblock == 0) cur_lblock = 1; /* force Empty message */ sis->max = cur_lblock; sis->pages = cur_lblock - 1; out: if (not_aligned) f2fs_warn(sbi, "Swapfile (%u) is not align to section: 1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate(%lu * N)", not_aligned, blks_per_sec * F2FS_BLKSIZE); return ret; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; if (!S_ISREG(inode->i_mode)) return -EINVAL; if (f2fs_readonly(sbi->sb)) return -EROFS; if (f2fs_lfs_mode(sbi) && !f2fs_sb_has_blkzoned(sbi)) { f2fs_err(sbi, "Swapfile not supported in LFS mode"); return -EINVAL; } ret = f2fs_convert_inline_inode(inode); if (ret) return ret; if (!f2fs_disable_compressed_file(inode)) return -EINVAL; ret = filemap_fdatawrite(inode->i_mapping); if (ret < 0) return ret; f2fs_precache_extents(inode); ret = check_swap_activate(sis, file, span); if (ret < 0) return ret; stat_inc_swapfile_inode(inode); set_inode_flag(inode, FI_PIN_FILE); f2fs_update_time(sbi, REQ_TIME); return ret; } static void f2fs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); stat_dec_swapfile_inode(inode); clear_inode_flag(inode, FI_PIN_FILE); } #else static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return -EOPNOTSUPP; } static void f2fs_swap_deactivate(struct file *file) { } #endif const struct address_space_operations f2fs_dblock_aops = { .read_folio = f2fs_read_data_folio, .readahead = f2fs_readahead, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, .dirty_folio = f2fs_dirty_data_folio, .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, }; void f2fs_clear_page_cache_dirty_tag(struct folio *folio) { struct address_space *mapping = folio->mapping; unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } int __init f2fs_init_post_read_processing(void) { bio_post_read_ctx_cache = kmem_cache_create("f2fs_bio_post_read_ctx", sizeof(struct bio_post_read_ctx), 0, 0, NULL); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void f2fs_destroy_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); } int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi) { if (!f2fs_sb_has_encrypt(sbi) && !f2fs_sb_has_verity(sbi) && !f2fs_sb_has_compression(sbi)) return 0; sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, num_online_cpus()); return sbi->post_read_wq ? 0 : -ENOMEM; } void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) { if (sbi->post_read_wq) destroy_workqueue(sbi->post_read_wq); } int __init f2fs_init_bio_entry_cache(void) { bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); return bio_entry_slab ? 0 : -ENOMEM; } void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct f2fs_map_blocks map = {}; pgoff_t next_pgofs = 0; int err; map.m_lblk = F2FS_BYTES_TO_BLK(offset); map.m_len = F2FS_BYTES_TO_BLK(offset + length - 1) - map.m_lblk + 1; map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); /* * If the blocks being overwritten are already allocated, * f2fs_map_lock and f2fs_balance_fs are not necessary. */ if ((flags & IOMAP_WRITE) && !f2fs_overwrite_io(inode, offset, length)) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); if (err) return err; iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk); /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); /* * We should never see delalloc or compressed extents here based on * prior flushing and checks. */ if (WARN_ON_ONCE(map.m_pblk == COMPRESS_ADDR)) return -EINVAL; if (map.m_flags & F2FS_MAP_MAPPED) { if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) return -EINVAL; iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); } else { if (flags & IOMAP_WRITE) return -ENOTBLK; if (map.m_pblk == NULL_ADDR) { iomap->length = F2FS_BLK_TO_BYTES(next_pgofs) - iomap->offset; iomap->type = IOMAP_HOLE; } else if (map.m_pblk == NEW_ADDR) { iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_UNWRITTEN; } else { f2fs_bug_on(F2FS_I_SB(inode), 1); } iomap->addr = IOMAP_NULL_ADDR; } if (map.m_flags & F2FS_MAP_NEW) iomap->flags |= IOMAP_F_NEW; if ((inode->i_state & I_DIRTY_DATASYNC) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; return 0; } const struct iomap_ops f2fs_iomap_ops = { .iomap_begin = f2fs_iomap_begin, }; |
2 16 9 16 16 16 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_OPTS_H #define _BCACHEFS_OPTS_H #include <linux/bug.h> #include <linux/log2.h> #include <linux/string.h> #include <linux/sysfs.h> #include "bcachefs_format.h" struct bch_fs; extern const char * const bch2_error_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const __bch2_csum_types[]; extern const char * const __bch2_csum_opts[]; extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; extern const char * const __bch2_data_types[]; extern const char * const bch2_member_states[]; extern const char * const bch2_d_types[]; void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); void bch2_prt_data_type(struct printbuf *, enum bch_data_type); void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); static inline const char *bch2_d_type_str(unsigned d_type) { return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; } /* * Mount options; we also store defaults in the superblock. * * Also exposed via sysfs: if an option is writeable, and it's also stored in * the superblock, changing it via sysfs (currently? might change this) also * updates the superblock. * * We store options as signed integers, where -1 means undefined. This means we * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only * apply the options from that struct that are defined. */ /* When can be set: */ enum opt_flags { OPT_FS = BIT(0), /* Filesystem option */ OPT_DEVICE = BIT(1), /* Device option */ OPT_INODE = BIT(2), /* Inode option */ OPT_FORMAT = BIT(3), /* May be specified at format time */ OPT_MOUNT = BIT(4), /* May be specified at mount time */ OPT_RUNTIME = BIT(5), /* May be specified at runtime */ OPT_HUMAN_READABLE = BIT(6), OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */ OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */ OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ OPT_HIDDEN = BIT(11), }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, BCH_OPT_STR, BCH_OPT_BITFIELD, BCH_OPT_FN, }; struct bch_opt_fn { int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); int (*validate)(u64, struct printbuf *); }; /** * x(name, shortopt, type, in mem type, mode, sb_opt) * * @name - name of mount option, sysfs attribute, and struct bch_opts * member * * @mode - when opt may be set * * @sb_option - name of corresponding superblock option * * @type - one of OPT_BOOL, OPT_UINT, OPT_STR */ /* * XXX: add fields for * - default value * - helptext */ #ifdef __KERNEL__ #define RATELIMIT_ERRORS_DEFAULT true #else #define RATELIMIT_ERRORS_DEFAULT false #endif #ifdef CONFIG_BCACHEFS_DEBUG #define BCACHEFS_VERBOSE_DEFAULT true #else #define BCACHEFS_VERBOSE_DEFAULT false #endif #define BCH_FIX_ERRORS_OPTS() \ x(exit, 0) \ x(yes, 1) \ x(no, 2) \ x(ask, 3) enum fsck_err_opts { #define x(t, n) FSCK_FIX_##t, BCH_FIX_ERRORS_OPTS() #undef x }; #define BCH_OPTS() \ x(block_size, u16, \ OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 16), \ BCH_SB_BLOCK_SIZE, 4 << 10, \ "size", NULL) \ x(btree_node_size, u32, \ OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 20), \ BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ x(write_error_timeout, u16, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, 300), \ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_META_REPLICAS_WANT, 1, \ "#", "Number of metadata replicas") \ x(data_replicas, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_WANT, 1, \ "#", "Number of data replicas") \ x(metadata_replicas_required, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_META_REPLICAS_REQ, 1, \ "#", NULL) \ x(data_replicas_required, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_REQ, 1, \ "#", NULL) \ x(encoded_extent_max, u32, \ OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ OPT_UINT(4096, 2U << 20), \ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(__bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(checksum_err_retry_nr, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(0, 32), \ BCH_SB_CSUM_ERR_RETRY_NR, 3, \ NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(str_hash, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_str_hash_opts), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or label for metadata writes") \ x(foreground_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_FOREGROUND_TARGET, 0, \ "(target)", "Device or label for foreground writes") \ x(background_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_BACKGROUND_TARGET, 0, \ "(target)", "Device or label to move data to in the background")\ x(promote_target, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0, \ "(target)", "Device or label to promote data to on read") \ x(erasure_code, u16, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(casefold, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT, \ OPT_BOOL(), \ BCH_SB_CASEFOLD, false, \ NULL, "Dirent lookups are casefolded") \ x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ x(shard_inode_numbers_bits, u8, \ OPT_FS|OPT_FORMAT, \ OPT_UINT(0, 8), \ BCH_SB_SHARD_INUMS_NBITS, 0, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_INODES_USE_KEY_CACHE, true, \ NULL, "Use the btree key cache for the inodes btree") \ x(btree_node_mem_ptr_optimization, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ BCH_SB_GC_RESERVE, 8, \ "%", "Percentage of disk space to reserve for copygc")\ x(gc_reserve_bytes, u64, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, U64_MAX), \ BCH_SB_GC_RESERVE_BYTES, 0, \ "%", "Amount of disk space to reserve for copygc\n" \ "Takes precedence over gc_reserve_percent if set")\ x(root_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(0, 100), \ BCH_SB_ROOT_RESERVE, 0, \ "%", "Percentage of disk space to reserve for superuser")\ x(wide_macs, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_128_BIT_MACS, false, \ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ x(inline_data, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(promote_whole_extents, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ NULL, "Promote whole extents, instead of just part being read")\ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_POSIX_ACL, true, \ NULL, "Enable POSIX acls") \ x(usrquota, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_USRQUOTA, false, \ NULL, "Enable user quotas") \ x(grpquota, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_GRPQUOTA, false, \ NULL, "Enable group quotas") \ x(prjquota, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ x(very_degraded, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ x(no_splitbrain_check, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't kick drives out when splitbrain detected")\ x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, U32_MAX), \ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ NULL, "Delay in milliseconds before automatic journal commits")\ x(journal_flush_disabled, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ NULL, "Disable journal flush on sync/fsync\n" \ "If enabled, writes can be lost, but only since the\n"\ "last journal write (default 1 second)") \ x(journal_reclaim_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(0, U32_MAX), \ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ NULL, "Delay in milliseconds before automatic journal reclaim")\ x(move_bytes_in_flight, u32, \ OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1024, U32_MAX), \ BCH2_NO_SB_OPT, 1U << 20, \ NULL, "Maximum Amount of IO to keep in flight by the move path")\ x(move_ios_in_flight, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, 1024), \ BCH2_NO_SB_OPT, 32, \ NULL, "Maximum number of IOs to keep in flight by the move path")\ x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ x(fsck_memory_usage_percent, u8, \ OPT_FS|OPT_MOUNT, \ OPT_UINT(20, 70), \ BCH2_NO_SB_OPT, 50, \ NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ BCH2_NO_SB_OPT, FSCK_FIX_exit, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ x(norecovery, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Exit recovery immediately prior to journal replay")\ x(recovery_passes, u64, \ OPT_FS|OPT_MOUNT, \ OPT_BITFIELD(bch2_recovery_passes), \ BCH2_NO_SB_OPT, 0, \ NULL, "Recovery passes to run explicitly") \ x(recovery_passes_exclude, u64, \ OPT_FS|OPT_MOUNT, \ OPT_BITFIELD(bch2_recovery_passes), \ BCH2_NO_SB_OPT, 0, \ NULL, "Recovery passes to exclude") \ x(recovery_pass_last, u8, \ OPT_FS|OPT_MOUNT, \ OPT_STR_NOLIMIT(bch2_recovery_passes), \ BCH2_NO_SB_OPT, 0, \ NULL, "Exit recovery after specified pass") \ x(retain_recovery_info, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys, scanned btree nodes after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ x(read_journal_only, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Only read the journal, skip the rest of recovery")\ x(journal_transaction_names, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ NULL, "Log transaction function names in journal") \ x(allocator_stuck_timeout, u16, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(0, U16_MAX), \ BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ NULL, "Default timeout in seconds for stuck allocator messages")\ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ x(direct_io, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Use O_DIRECT (userspace only)") \ x(sb, u64, \ OPT_MOUNT, \ OPT_UINT(0, S64_MAX), \ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ OPT_FS|OPT_MOUNT|OPT_HIDDEN, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(nostart, u8, \ 0, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ x(reconstruct_alloc, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ OPT_FS|OPT_MOUNT, \ OPT_STR(bch2_version_upgrade_opts), \ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ x(stdio, u64, \ 0, \ OPT_UINT(0, S64_MAX), \ BCH2_NO_SB_OPT, false, \ NULL, "Pointer to a struct stdio_redirect") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(nocow, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ OPT_BOOL(), \ BCH_SB_NOCOW, false, \ NULL, "Nocow mode: Writes will be done in place when possible.\n"\ "Snapshots and reflink will still caused writes to be COW\n"\ "Implicitly disables data checksumming, compression and encryption")\ x(nocow_enabled, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ x(copygc_enabled, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable copygc: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ x(rebalance_enabled, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Skip submit_bio() for data reads and writes, " \ "for performance testing purposes") \ x(state, u64, \ OPT_DEVICE|OPT_RUNTIME, \ OPT_STR(bch2_member_states), \ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ "state", "rw,ro,failed,spare") \ x(bucket_size, u32, \ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, S64_MAX), \ BCH_MEMBER_BUCKET_SIZE, 0, \ "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ BCH_MEMBER_DURABILITY, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ x(data_allowed, u8, \ OPT_DEVICE, \ OPT_BITFIELD(__bch2_data_types), \ BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ x(discard, u8, \ OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_MEMBER_DISCARD, true, \ NULL, "Enable discard/TRIM support") \ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() #undef x #define x(_name, _bits, ...) _bits _name; BCH_OPTS() #undef x }; struct bch2_opts_parse { struct bch_opts opts; /* to save opts that can't be parsed before the FS is opened: */ struct printbuf parse_later; }; static const __maybe_unused struct bch_opts bch2_opts_default = { #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ._name##_defined = true, \ ._name = _default, \ BCH_OPTS() #undef x }; #define opt_defined(_opts, _name) ((_opts)._name##_defined) #define opt_get(_opts, _name) \ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) #define opt_set(_opts, _name, _v) \ do { \ (_opts)._name##_defined = true; \ (_opts)._name = _v; \ } while (0) static inline struct bch_opts bch2_opts_empty(void) { return (struct bch_opts) { 0 }; } void bch2_opts_apply(struct bch_opts *, struct bch_opts); enum bch_opt_id { #define x(_name, ...) Opt_##_name, BCH_OPTS() #undef x bch2_opts_nr }; struct bch_fs; struct printbuf; struct bch_option { struct attribute attr; enum opt_type type; enum opt_flags flags; u64 min, max; const char * const *choices; struct bch_opt_fn fn; const char *hint; const char *help; u64 (*get_sb)(const struct bch_sb *); void (*set_sb)(struct bch_sb *, u64); u64 (*get_member)(const struct bch_member *); void (*set_member)(struct bch_member *, u64); }; extern const struct bch_option bch2_opt_table[]; bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); struct bch_dev; void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *, struct printbuf *); #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, const struct bch_option *, u64, unsigned); void bch2_opts_to_text(struct printbuf *, struct bch_opts, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, char *, bool); /* inode opts: */ struct bch_io_opts { #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x #define x(_name, _bits) u64 _name##_from_inode:1; BCH_INODE_OPTS() #undef x }; static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) { if (!opts->background_target) opts->background_target = opts->foreground_target; if (!opts->background_compression) opts->background_compression = opts->compression; if (opts->nocow) { opts->compression = opts->background_compression = 0; opts->data_checksum = 0; opts->erasure_code = 0; } } struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); #endif /* _BCACHEFS_OPTS_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Resilient Queued Spin Lock defines * * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates. * * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com> */ #ifndef __LINUX_RQSPINLOCK_H #define __LINUX_RQSPINLOCK_H #include "../locking/qspinlock.h" /* * try_cmpxchg_tail - Return result of cmpxchg of tail word with a new value * @lock: Pointer to queued spinlock structure * @tail: The tail to compare against * @new_tail: The new queue tail code word * Return: Bool to indicate whether the cmpxchg operation succeeded * * This is used by the head of the wait queue to clean up the queue. * Provides relaxed ordering, since observers only rely on initialized * state of the node which was made visible through the xchg_tail operation, * i.e. through the smp_wmb preceding xchg_tail. * * We avoid using 16-bit cmpxchg, which is not available on all architectures. */ static __always_inline bool try_cmpxchg_tail(struct qspinlock *lock, u32 tail, u32 new_tail) { u32 old, new; old = atomic_read(&lock->val); do { /* * Is the tail part we compare to already stale? Fail. */ if ((old & _Q_TAIL_MASK) != tail) return false; /* * Encode latest locked/pending state for new tail. */ new = (old & _Q_LOCKED_PENDING_MASK) | new_tail; } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); return true; } #endif /* __LINUX_RQSPINLOCK_H */ |
433 58 410 2 430 430 410 59 58 2 2 430 430 42 433 2104 435 42 433 59 433 432 431 26 26 433 79 433 435 62 434 398 398 432 432 70 45 44 2096 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 | // SPDX-License-Identifier: GPL-2.0 /* * A fast, small, non-recursive O(n log n) sort for the Linux kernel * * This performs n*log2(n) + 0.37*n + o(n) comparisons on average, * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case. * * Quicksort manages n*log2(n) - 1.26*n for random inputs (1.63*n * better) at the expense of stack usage and much larger code to avoid * quicksort's O(n^2) worst case. */ #include <linux/types.h> #include <linux/export.h> #include <linux/sort.h> /** * is_aligned - is this pointer & size okay for word-wide copying? * @base: pointer to data * @size: size of each element * @align: required alignment (typically 4 or 8) * * Returns true if elements can be copied using word loads and stores. * The size must be a multiple of the alignment, and the base address must * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. * * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)" * to "if ((a | b) & mask)", so we do that by hand. */ __attribute_const__ __always_inline static bool is_aligned(const void *base, size_t size, unsigned char align) { unsigned char lsbits = (unsigned char)size; (void)base; #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS lsbits |= (unsigned char)(uintptr_t)base; #endif return (lsbits & (align - 1)) == 0; } /** * swap_words_32 - swap two elements in 32-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 4) * * Exchange the two objects in memory. This exploits base+index addressing, * which basically all CPUs have, to minimize loop overhead computations. * * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the * bottom of the loop, even though the zero flag is still valid from the * subtract (since the intervening mov instructions don't alter the flags). * Gcc 8.1.0 doesn't have that problem. */ static void swap_words_32(void *a, void *b, size_t n) { do { u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; } while (n); } /** * swap_words_64 - swap two elements in 64-bit chunks * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size (must be a multiple of 8) * * Exchange the two objects in memory. This exploits base+index * addressing, which basically all CPUs have, to minimize loop overhead * computations. * * We'd like to use 64-bit loads if possible. If they're not, emulating * one requires base+index+4 addressing which x86 has but most other * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads, * but it's possible to have 64-bit loads without 64-bit pointers (e.g. * x32 ABI). Are there any cases the kernel needs to worry about? */ static void swap_words_64(void *a, void *b, size_t n) { do { #ifdef CONFIG_64BIT u64 t = *(u64 *)(a + (n -= 8)); *(u64 *)(a + n) = *(u64 *)(b + n); *(u64 *)(b + n) = t; #else /* Use two 32-bit transfers to avoid base+index+4 addressing */ u32 t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; t = *(u32 *)(a + (n -= 4)); *(u32 *)(a + n) = *(u32 *)(b + n); *(u32 *)(b + n) = t; #endif } while (n); } /** * swap_bytes - swap two elements a byte at a time * @a: pointer to the first element to swap * @b: pointer to the second element to swap * @n: element size * * This is the fallback if alignment doesn't allow using larger chunks. */ static void swap_bytes(void *a, void *b, size_t n) { do { char t = ((char *)a)[--n]; ((char *)a)[n] = ((char *)b)[n]; ((char *)b)[n] = t; } while (n); } /* * The values are arbitrary as long as they can't be confused with * a pointer, but small integers make for the smallest compare * instructions. */ #define SWAP_WORDS_64 (swap_r_func_t)0 #define SWAP_WORDS_32 (swap_r_func_t)1 #define SWAP_BYTES (swap_r_func_t)2 #define SWAP_WRAPPER (swap_r_func_t)3 struct wrapper { cmp_func_t cmp; swap_func_t swap; }; /* * The function pointer is last to make tail calls most efficient if the * compiler decides not to inline this function. */ static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { if (swap_func == SWAP_WRAPPER) { ((const struct wrapper *)priv)->swap(a, b, (int)size); return; } if (swap_func == SWAP_WORDS_64) swap_words_64(a, b, size); else if (swap_func == SWAP_WORDS_32) swap_words_32(a, b, size); else if (swap_func == SWAP_BYTES) swap_bytes(a, b, size); else swap_func(a, b, (int)size, priv); } #define _CMP_WRAPPER ((cmp_r_func_t)0L) static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) return ((const struct wrapper *)priv)->cmp(a, b); return cmp(a, b, priv); } /** * parent - given the offset of the child, find the offset of the parent. * @i: the offset of the heap element whose parent is sought. Non-zero. * @lsbit: a precomputed 1-bit mask, equal to "size & -size" * @size: size of each element * * In terms of array indexes, the parent of element j = @i/@size is simply * (j-1)/2. But when working in byte offsets, we can't use implicit * truncation of integer divides. * * Fortunately, we only need one bit of the quotient, not the full divide. * @size has a least significant bit. That bit will be clear if @i is * an even multiple of @size, and set if it's an odd multiple. * * Logically, we're doing "if (i & lsbit) i -= size;", but since the * branch is unpredictable, it's done with a bit of clever branch-free * code instead. */ __attribute_const__ __always_inline static size_t parent(size_t i, unsigned int lsbit, size_t size) { i -= size; i -= size & -(i & lsbit); return i / 2; } #include <linux/sched.h> static void __sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv, bool may_schedule) { /* pre-scale counters for performance */ size_t n = num * size, a = (num/2) * size; const unsigned int lsbit = size & -size; /* Used to find parent */ size_t shift = 0; if (!a) /* num < 2 || size == 0 */ return; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) swap_func = NULL; if (!swap_func) { if (is_aligned(base, size, 8)) swap_func = SWAP_WORDS_64; else if (is_aligned(base, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* * Loop invariants: * 1. elements [a,n) satisfy the heap property (compare greater than * all of their children), * 2. elements [n,num*size) are sorted, and * 3. a <= b <= c <= d <= n (whenever they are valid). */ for (;;) { size_t b, c, d; if (a) /* Building heap: sift down a */ a -= size << shift; else if (n > 3 * size) { /* Sorting: Extract two largest elements */ n -= size; do_swap(base, base + n, size, swap_func, priv); shift = do_cmp(base + size, base + 2 * size, cmp_func, priv) <= 0; a = size << shift; n -= size; do_swap(base + a, base + n, size, swap_func, priv); } else { /* Sort complete */ break; } /* * Sift element at "a" down into heap. This is the * "bottom-up" variant, which significantly reduces * calls to cmp_func(): we find the sift-down path all * the way to the leaves (one compare per level), then * backtrack to find where to insert the target element. * * Because elements tend to sift down close to the leaves, * this uses fewer compares than doing two per level * on the way down. (A bit more than half as many on * average, 3/4 worst-case.) */ for (b = a; c = 2*b + size, (d = c + size) < n;) b = do_cmp(base + c, base + d, cmp_func, priv) > 0 ? c : d; if (d == n) /* Special case last leaf with no sibling */ b = c; /* Now backtrack from "b" to the correct location for "a" */ while (b != a && do_cmp(base + a, base + b, cmp_func, priv) >= 0) b = parent(b, lsbit, size); c = b; /* Where "a" belongs */ while (b != a) { /* Shift it into place */ b = parent(b, lsbit, size); do_swap(base + b, base + c, size, swap_func, priv); } if (may_schedule) cond_resched(); } n -= size; do_swap(base, base + n, size, swap_func, priv); if (n == size * 2 && do_cmp(base, base + size, cmp_func, priv) > 0) do_swap(base, base + size, size, swap_func, priv); } /** * sort_r - sort an array of elements * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * @priv: third argument passed to comparison function * * This function does a heapsort on the given array. You may provide * a swap_func function if you need to do something more than a memory * copy (e.g. fix up pointers or auxiliary data), but the built-in swap * avoids a slow retpoline and so is significantly faster. * * The comparison function must adhere to specific mathematical * properties to ensure correct and stable sorting: * - Antisymmetry: cmp_func(a, b) must return the opposite sign of * cmp_func(b, a). * - Transitivity: if cmp_func(a, b) <= 0 and cmp_func(b, c) <= 0, then * cmp_func(a, c) <= 0. * * Sorting time is O(n log n) both on average and worst-case. While * quicksort is slightly faster on average, it suffers from exploitable * O(n*n) worst-case behavior and extra memory requirements that make * it less suitable for kernel use. */ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { __sort_r(base, num, size, cmp_func, swap_func, priv, false); } EXPORT_SYMBOL(sort_r); /** * sort_r_nonatomic - sort an array of elements, with cond_resched * @base: pointer to data to sort * @num: number of elements * @size: size of each element * @cmp_func: pointer to comparison function * @swap_func: pointer to swap function or NULL * @priv: third argument passed to comparison function * * Same as sort_r, but preferred for larger arrays as it does a periodic * cond_resched(). */ void sort_r_nonatomic(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, swap_r_func_t swap_func, const void *priv) { __sort_r(base, num, size, cmp_func, swap_func, priv, true); } EXPORT_SYMBOL(sort_r_nonatomic); void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap = swap_func, }; return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, false); } EXPORT_SYMBOL(sort); void sort_nonatomic(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { struct wrapper w = { .cmp = cmp_func, .swap = swap_func, }; return __sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w, true); } EXPORT_SYMBOL(sort_nonatomic); |
135 136 136 93 55 92 93 77 50 91 33 33 27 7 26 1 26 26 26 26 26 26 26 26 26 26 29 26 29 26 28 29 86 61 61 61 34 34 138 135 136 14 106 106 48 103 103 104 103 101 60 60 103 79 80 34 103 106 107 68 68 67 66 32 66 62 66 68 68 21 20 18 20 20 68 68 66 32 2 33 33 32 68 45 44 69 64 53 53 37 53 53 68 3 30 27 33 35 16 7 16 3 13 12 52 21 20 13 21 20 3 2 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 | // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * MIDI device handlers * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include <sound/asoundef.h> #include "seq_oss_midi.h" #include "seq_oss_readq.h" #include "seq_oss_timer.h" #include "seq_oss_event.h" #include <sound/seq_midi_event.h> #include "../seq_lock.h" #include <linux/init.h> #include <linux/slab.h> #include <linux/nospec.h> /* * constants */ #define SNDRV_SEQ_OSS_MAX_MIDI_NAME 30 /* * definition of midi device record */ struct seq_oss_midi { int seq_device; /* device number */ int client; /* sequencer client number */ int port; /* sequencer port number */ unsigned int flags; /* port capability */ int opened; /* flag for opening */ unsigned char name[SNDRV_SEQ_OSS_MAX_MIDI_NAME]; struct snd_midi_event *coder; /* MIDI event coder */ struct seq_oss_devinfo *devinfo; /* assigned OSSseq device */ snd_use_lock_t use_lock; struct mutex open_mutex; }; /* * midi device table */ static int max_midi_devs; static struct seq_oss_midi *midi_devs[SNDRV_SEQ_OSS_MAX_MIDI_DEVS]; static DEFINE_SPINLOCK(register_lock); /* * prototypes */ static struct seq_oss_midi *get_mdev(int dev); static struct seq_oss_midi *get_mididev(struct seq_oss_devinfo *dp, int dev); static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev); static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev); /* * look up the existing ports * this looks a very exhausting job. */ int snd_seq_oss_midi_lookup_ports(int client) { struct snd_seq_client_info *clinfo __free(kfree) = NULL; struct snd_seq_port_info *pinfo __free(kfree) = NULL; clinfo = kzalloc(sizeof(*clinfo), GFP_KERNEL); pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); if (!clinfo || !pinfo) return -ENOMEM; clinfo->client = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_CLIENT, clinfo) == 0) { if (clinfo->client == client) continue; /* ignore myself */ pinfo->addr.client = clinfo->client; pinfo->addr.port = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_PORT, pinfo) == 0) snd_seq_oss_midi_check_new_port(pinfo); } return 0; } /* */ static struct seq_oss_midi * get_mdev(int dev) { struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); mdev = midi_devs[dev]; if (mdev) snd_use_lock_use(&mdev->use_lock); spin_unlock_irqrestore(®ister_lock, flags); return mdev; } /* * look for the identical slot */ static struct seq_oss_midi * find_slot(int client, int port) { int i; struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev && mdev->client == client && mdev->port == port) { /* found! */ snd_use_lock_use(&mdev->use_lock); spin_unlock_irqrestore(®ister_lock, flags); return mdev; } } spin_unlock_irqrestore(®ister_lock, flags); return NULL; } #define PERM_WRITE (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_SUBS_WRITE) #define PERM_READ (SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ) /* * register a new port if it doesn't exist yet */ int snd_seq_oss_midi_check_new_port(struct snd_seq_port_info *pinfo) { int i; struct seq_oss_midi *mdev; unsigned long flags; /* the port must include generic midi */ if (! (pinfo->type & SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC)) return 0; /* either read or write subscribable */ if ((pinfo->capability & PERM_WRITE) != PERM_WRITE && (pinfo->capability & PERM_READ) != PERM_READ) return 0; /* * look for the identical slot */ mdev = find_slot(pinfo->addr.client, pinfo->addr.port); if (mdev) { /* already exists */ snd_use_lock_free(&mdev->use_lock); return 0; } /* * allocate midi info record */ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return -ENOMEM; /* copy the port information */ mdev->client = pinfo->addr.client; mdev->port = pinfo->addr.port; mdev->flags = pinfo->capability; mdev->opened = 0; snd_use_lock_init(&mdev->use_lock); mutex_init(&mdev->open_mutex); /* copy and truncate the name of synth device */ strscpy(mdev->name, pinfo->name, sizeof(mdev->name)); /* create MIDI coder */ if (snd_midi_event_new(MAX_MIDI_EVENT_BUF, &mdev->coder) < 0) { pr_err("ALSA: seq_oss: can't malloc midi coder\n"); kfree(mdev); return -ENOMEM; } /* OSS sequencer adds running status to all sequences */ snd_midi_event_no_status(mdev->coder, 1); /* * look for en empty slot */ spin_lock_irqsave(®ister_lock, flags); for (i = 0; i < max_midi_devs; i++) { if (midi_devs[i] == NULL) break; } if (i >= max_midi_devs) { if (max_midi_devs >= SNDRV_SEQ_OSS_MAX_MIDI_DEVS) { spin_unlock_irqrestore(®ister_lock, flags); snd_midi_event_free(mdev->coder); kfree(mdev); return -ENOMEM; } max_midi_devs++; } mdev->seq_device = i; midi_devs[mdev->seq_device] = mdev; spin_unlock_irqrestore(®ister_lock, flags); return 0; } /* * release the midi device if it was registered */ int snd_seq_oss_midi_check_exit_port(int client, int port) { struct seq_oss_midi *mdev; unsigned long flags; int index; mdev = find_slot(client, port); if (mdev) { spin_lock_irqsave(®ister_lock, flags); midi_devs[mdev->seq_device] = NULL; spin_unlock_irqrestore(®ister_lock, flags); snd_use_lock_free(&mdev->use_lock); snd_use_lock_sync(&mdev->use_lock); snd_midi_event_free(mdev->coder); kfree(mdev); } spin_lock_irqsave(®ister_lock, flags); for (index = max_midi_devs - 1; index >= 0; index--) { if (midi_devs[index]) break; } max_midi_devs = index + 1; spin_unlock_irqrestore(®ister_lock, flags); return 0; } /* * release the midi device if it was registered */ void snd_seq_oss_midi_clear_all(void) { int i; struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(®ister_lock, flags); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev) { snd_midi_event_free(mdev->coder); kfree(mdev); midi_devs[i] = NULL; } } max_midi_devs = 0; spin_unlock_irqrestore(®ister_lock, flags); } /* * set up midi tables */ void snd_seq_oss_midi_setup(struct seq_oss_devinfo *dp) { spin_lock_irq(®ister_lock); dp->max_mididev = max_midi_devs; spin_unlock_irq(®ister_lock); } /* * clean up midi tables */ void snd_seq_oss_midi_cleanup(struct seq_oss_devinfo *dp) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_close(dp, i); dp->max_mididev = 0; } /* * open all midi devices. ignore errors. */ void snd_seq_oss_midi_open_all(struct seq_oss_devinfo *dp, int file_mode) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_open(dp, i, file_mode); } /* * get the midi device information */ static struct seq_oss_midi * get_mididev(struct seq_oss_devinfo *dp, int dev) { if (dev < 0 || dev >= dp->max_mididev) return NULL; dev = array_index_nospec(dev, dp->max_mididev); return get_mdev(dev); } /* * open the midi device if not opened yet */ int snd_seq_oss_midi_open(struct seq_oss_devinfo *dp, int dev, int fmode) { int perm; struct seq_oss_midi *mdev; struct snd_seq_port_subscribe subs; int err; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; mutex_lock(&mdev->open_mutex); /* already used? */ if (mdev->opened && mdev->devinfo != dp) { err = -EBUSY; goto unlock; } perm = 0; if (is_write_mode(fmode)) perm |= PERM_WRITE; if (is_read_mode(fmode)) perm |= PERM_READ; perm &= mdev->flags; if (perm == 0) { err = -ENXIO; goto unlock; } /* already opened? */ if ((mdev->opened & perm) == perm) { err = 0; goto unlock; } perm &= ~mdev->opened; memset(&subs, 0, sizeof(subs)); if (perm & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_WRITE; } if (perm & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; subs.flags = SNDRV_SEQ_PORT_SUBS_TIMESTAMP; subs.queue = dp->queue; /* queue for timestamps */ if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_READ; } if (! mdev->opened) { err = -ENXIO; goto unlock; } mdev->devinfo = dp; err = 0; unlock: mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); return err; } /* * close the midi device if already opened */ int snd_seq_oss_midi_close(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; struct snd_seq_port_subscribe subs; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; mutex_lock(&mdev->open_mutex); if (!mdev->opened || mdev->devinfo != dp) goto unlock; memset(&subs, 0, sizeof(subs)); if (mdev->opened & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } if (mdev->opened & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } mdev->opened = 0; mdev->devinfo = NULL; unlock: mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); return 0; } /* * change seq capability flags to file mode flags */ int snd_seq_oss_midi_filemode(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; int mode; mdev = get_mididev(dp, dev); if (!mdev) return 0; mode = 0; if (mdev->opened & PERM_WRITE) mode |= SNDRV_SEQ_OSS_FILE_WRITE; if (mdev->opened & PERM_READ) mode |= SNDRV_SEQ_OSS_FILE_READ; snd_use_lock_free(&mdev->use_lock); return mode; } /* * reset the midi device and close it: * so far, only close the device. */ void snd_seq_oss_midi_reset(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return; if (! mdev->opened) { snd_use_lock_free(&mdev->use_lock); return; } if (mdev->opened & PERM_WRITE) { struct snd_seq_event ev; int c; memset(&ev, 0, sizeof(ev)); ev.dest.client = mdev->client; ev.dest.port = mdev->port; ev.queue = dp->queue; ev.source.port = dp->port; if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_SYNTH) { ev.type = SNDRV_SEQ_EVENT_SENSING; snd_seq_oss_dispatch(dp, &ev, 0, 0); } for (c = 0; c < 16; c++) { ev.type = SNDRV_SEQ_EVENT_CONTROLLER; ev.data.control.channel = c; ev.data.control.param = MIDI_CTL_ALL_NOTES_OFF; snd_seq_oss_dispatch(dp, &ev, 0, 0); if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) { ev.data.control.param = MIDI_CTL_RESET_CONTROLLERS; snd_seq_oss_dispatch(dp, &ev, 0, 0); ev.type = SNDRV_SEQ_EVENT_PITCHBEND; ev.data.control.value = 0; snd_seq_oss_dispatch(dp, &ev, 0, 0); } } } // snd_seq_oss_midi_close(dp, dev); snd_use_lock_free(&mdev->use_lock); } /* * get client/port of the specified MIDI device */ void snd_seq_oss_midi_get_addr(struct seq_oss_devinfo *dp, int dev, struct snd_seq_addr *addr) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return; addr->client = mdev->client; addr->port = mdev->port; snd_use_lock_free(&mdev->use_lock); } /* * input callback - this can be atomic */ int snd_seq_oss_midi_input(struct snd_seq_event *ev, int direct, void *private_data) { struct seq_oss_devinfo *dp = (struct seq_oss_devinfo *)private_data; struct seq_oss_midi *mdev; int rc; if (dp->readq == NULL) return 0; mdev = find_slot(ev->source.client, ev->source.port); if (!mdev) return 0; if (! (mdev->opened & PERM_READ)) { snd_use_lock_free(&mdev->use_lock); return 0; } if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) rc = send_synth_event(dp, ev, mdev->seq_device); else rc = send_midi_event(dp, ev, mdev); snd_use_lock_free(&mdev->use_lock); return rc; } /* * convert ALSA sequencer event to OSS synth event */ static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev) { union evrec ossev; memset(&ossev, 0, sizeof(ossev)); switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: ossev.v.cmd = MIDI_NOTEON; break; case SNDRV_SEQ_EVENT_NOTEOFF: ossev.v.cmd = MIDI_NOTEOFF; break; case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.cmd = MIDI_KEY_PRESSURE; break; case SNDRV_SEQ_EVENT_CONTROLLER: ossev.l.cmd = MIDI_CTL_CHANGE; break; case SNDRV_SEQ_EVENT_PGMCHANGE: ossev.l.cmd = MIDI_PGM_CHANGE; break; case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.cmd = MIDI_CHN_PRESSURE; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.cmd = MIDI_PITCH_BEND; break; default: return 0; /* not supported */ } ossev.v.dev = dev; switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: case SNDRV_SEQ_EVENT_NOTEOFF: case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.code = EV_CHN_VOICE; ossev.v.note = ev->data.note.note; ossev.v.parm = ev->data.note.velocity; ossev.v.chn = ev->data.note.channel; break; case SNDRV_SEQ_EVENT_CONTROLLER: case SNDRV_SEQ_EVENT_PGMCHANGE: case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.code = EV_CHN_COMMON; ossev.l.p1 = ev->data.control.param; ossev.l.val = ev->data.control.value; ossev.l.chn = ev->data.control.channel; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.code = EV_CHN_COMMON; ossev.l.val = ev->data.control.value + 8192; ossev.l.chn = ev->data.control.channel; break; } snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); snd_seq_oss_readq_put_event(dp->readq, &ossev); return 0; } /* * decode event and send MIDI bytes to read queue */ static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev) { char msg[32]; int len; snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); if (!dp->timer->running) len = snd_seq_oss_timer_start(dp->timer); if (ev->type == SNDRV_SEQ_EVENT_SYSEX) { snd_seq_oss_readq_sysex(dp->readq, mdev->seq_device, ev); snd_midi_event_reset_decode(mdev->coder); } else { len = snd_midi_event_decode(mdev->coder, msg, sizeof(msg), ev); if (len > 0) snd_seq_oss_readq_puts(dp->readq, mdev->seq_device, msg, len); } return 0; } /* * dump midi data * return 0 : enqueued * non-zero : invalid - ignored */ int snd_seq_oss_midi_putc(struct seq_oss_devinfo *dp, int dev, unsigned char c, struct snd_seq_event *ev) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; if (snd_midi_event_encode_byte(mdev->coder, c, ev)) { snd_seq_oss_fill_addr(dp, ev, mdev->client, mdev->port); snd_use_lock_free(&mdev->use_lock); return 0; } snd_use_lock_free(&mdev->use_lock); return -EINVAL; } /* * create OSS compatible midi_info record */ int snd_seq_oss_midi_make_info(struct seq_oss_devinfo *dp, int dev, struct midi_info *inf) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return -ENXIO; inf->device = dev; inf->dev_type = 0; /* FIXME: ?? */ inf->capabilities = 0; /* FIXME: ?? */ strscpy(inf->name, mdev->name, sizeof(inf->name)); snd_use_lock_free(&mdev->use_lock); return 0; } #ifdef CONFIG_SND_PROC_FS /* * proc interface */ static char * capmode_str(int val) { val &= PERM_READ|PERM_WRITE; if (val == (PERM_READ|PERM_WRITE)) return "read/write"; else if (val == PERM_READ) return "read"; else if (val == PERM_WRITE) return "write"; else return "none"; } void snd_seq_oss_midi_info_read(struct snd_info_buffer *buf) { int i; struct seq_oss_midi *mdev; snd_iprintf(buf, "\nNumber of MIDI devices: %d\n", max_midi_devs); for (i = 0; i < max_midi_devs; i++) { snd_iprintf(buf, "\nmidi %d: ", i); mdev = get_mdev(i); if (mdev == NULL) { snd_iprintf(buf, "*empty*\n"); continue; } snd_iprintf(buf, "[%s] ALSA port %d:%d\n", mdev->name, mdev->client, mdev->port); snd_iprintf(buf, " capability %s / opened %s\n", capmode_str(mdev->flags), capmode_str(mdev->opened)); snd_use_lock_free(&mdev->use_lock); } } #endif /* CONFIG_SND_PROC_FS */ |
7 7 7 7 7 7 7 7 3 31 31 31 31 30 3 1 5 4 4 3 8 45 46 7 6 5 20 18 18 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/array_size.h> #include <linux/atomic.h> #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/crc32c.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kobject.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> #include <net/dsfield.h> #include <net/genetlink.h> #include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_iv_ogm.h" #include "bat_v.h" #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" #include "multicast.h" #include "netlink.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" #include "tp_meter.h" #include "translation-table.h" /* List manipulations on hardif_list have to be rtnl_lock()'ed, * list traversals just rcu-locked */ struct list_head batadv_hardif_list; unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct workqueue_struct *batadv_event_workqueue; static void batadv_recv_handler_init(void); #define BATADV_UEV_TYPE_VAR "BATTYPE=" #define BATADV_UEV_ACTION_VAR "BATACTION=" #define BATADV_UEV_DATA_VAR "BATDATA=" static char *batadv_uev_action_str[] = { "add", "del", "change", "loopdetect", }; static char *batadv_uev_type_str[] = { "gw", "bla", }; static int __init batadv_init(void) { int ret; ret = batadv_tt_cache_init(); if (ret < 0) return ret; INIT_LIST_HEAD(&batadv_hardif_list); batadv_algo_init(); batadv_recv_handler_init(); batadv_v_init(); batadv_iv_init(); batadv_nc_init(); batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); if (!batadv_event_workqueue) goto err_create_wq; register_netdevice_notifier(&batadv_hard_if_notifier); rtnl_link_register(&batadv_link_ops); batadv_netlink_register(); pr_info("B.A.T.M.A.N. advanced %s (compatibility version %i) loaded\n", BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); return 0; err_create_wq: batadv_tt_cache_destroy(); return -ENOMEM; } static void __exit batadv_exit(void) { batadv_netlink_unregister(); rtnl_link_unregister(&batadv_link_ops); unregister_netdevice_notifier(&batadv_hard_if_notifier); destroy_workqueue(batadv_event_workqueue); batadv_event_workqueue = NULL; rcu_barrier(); batadv_tt_cache_destroy(); } /** * batadv_mesh_init() - Initialize mesh interface * @mesh_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_mesh_init(struct net_device *mesh_iface) { struct batadv_priv *bat_priv = netdev_priv(mesh_iface); int ret; spin_lock_init(&bat_priv->forw_bat_list_lock); spin_lock_init(&bat_priv->forw_bcast_list_lock); spin_lock_init(&bat_priv->tt.changes_list_lock); spin_lock_init(&bat_priv->tt.req_list_lock); spin_lock_init(&bat_priv->tt.roam_list_lock); spin_lock_init(&bat_priv->tt.last_changeset_lock); spin_lock_init(&bat_priv->tt.commit_lock); spin_lock_init(&bat_priv->gw.list_lock); #ifdef CONFIG_BATMAN_ADV_MCAST spin_lock_init(&bat_priv->mcast.mla_lock); spin_lock_init(&bat_priv->mcast.want_lists_lock); #endif spin_lock_init(&bat_priv->tvlv.container_list_lock); spin_lock_init(&bat_priv->tvlv.handler_list_lock); spin_lock_init(&bat_priv->meshif_vlan_list_lock); spin_lock_init(&bat_priv->tp_list_lock); INIT_HLIST_HEAD(&bat_priv->forw_bat_list); INIT_HLIST_HEAD(&bat_priv->forw_bcast_list); INIT_HLIST_HEAD(&bat_priv->gw.gateway_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.want_all_unsnoopables_list); INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv4_list); INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list); #endif INIT_LIST_HEAD(&bat_priv->tt.changes_list); INIT_HLIST_HEAD(&bat_priv->tt.req_list); INIT_LIST_HEAD(&bat_priv->tt.roam_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.mla_list); #endif INIT_HLIST_HEAD(&bat_priv->tvlv.container_list); INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); INIT_HLIST_HEAD(&bat_priv->meshif_vlan_list); INIT_HLIST_HEAD(&bat_priv->tp_list); bat_priv->gw.generation = 0; ret = batadv_originator_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_orig; } ret = batadv_tt_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_tt; } ret = batadv_v_mesh_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_v; } ret = batadv_bla_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_bla; } ret = batadv_dat_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_dat; } ret = batadv_nc_mesh_init(bat_priv); if (ret < 0) { atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); goto err_nc; } batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); atomic_set(&bat_priv->gw.reselect, 0); atomic_set(&bat_priv->mesh_state, BATADV_MESH_ACTIVE); return 0; err_nc: batadv_dat_free(bat_priv); err_dat: batadv_bla_free(bat_priv); err_bla: batadv_v_mesh_free(bat_priv); err_v: batadv_tt_free(bat_priv); err_tt: batadv_originator_free(bat_priv); err_orig: batadv_purge_outstanding_packets(bat_priv, NULL); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); return ret; } /** * batadv_mesh_free() - Deinitialize mesh interface * @mesh_iface: netdev struct of the mesh interface */ void batadv_mesh_free(struct net_device *mesh_iface) { struct batadv_priv *bat_priv = netdev_priv(mesh_iface); atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); batadv_purge_outstanding_packets(bat_priv, NULL); batadv_gw_node_free(bat_priv); batadv_v_mesh_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); batadv_mcast_free(bat_priv); /* Free the TT and the originator tables only after having terminated * all the other depending components which may use these structures for * their purposes. */ batadv_tt_free(bat_priv); /* Since the originator table clean up routine is accessing the TT * tables as well, it has to be invoked after the TT tables have been * freed and marked as empty. This ensures that no cleanup RCU callbacks * accessing the TT data are scheduled for later execution. */ batadv_originator_free(bat_priv); batadv_gw_free(bat_priv); free_percpu(bat_priv->bat_counters); bat_priv->bat_counters = NULL; atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); } /** * batadv_is_my_mac() - check if the given mac address belongs to any of the * real interfaces in the current mesh * @bat_priv: the bat priv with all the mesh interface information * @addr: the address to check * * Return: 'true' if the mac address was found, false otherwise. */ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { const struct batadv_hard_iface *hard_iface; bool is_my_mac = false; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { is_my_mac = true; break; } } rcu_read_unlock(); return is_my_mac; } /** * batadv_max_header_len() - calculate maximum encapsulation overhead for a * payload packet * * Return: the maximum encapsulation overhead in bytes. */ int batadv_max_header_len(void) { int header_len = 0; header_len = max_t(int, header_len, sizeof(struct batadv_unicast_packet)); header_len = max_t(int, header_len, sizeof(struct batadv_unicast_4addr_packet)); header_len = max_t(int, header_len, sizeof(struct batadv_bcast_packet)); #ifdef CONFIG_BATMAN_ADV_NC header_len = max_t(int, header_len, sizeof(struct batadv_coded_packet)); #endif return header_len + ETH_HLEN; } /** * batadv_skb_set_priority() - sets skb priority according to packet content * @skb: the packet to be sent * @offset: offset to the packet content * * This function sets a value between 256 and 263 (802.1d priority), which * can be interpreted by the cfg80211 or other drivers. */ void batadv_skb_set_priority(struct sk_buff *skb, int offset) { struct iphdr ip_hdr_tmp, *ip_hdr; struct ipv6hdr ip6_hdr_tmp, *ip6_hdr; struct ethhdr ethhdr_tmp, *ethhdr; struct vlan_ethhdr *vhdr, vhdr_tmp; u32 prio; /* already set, do nothing */ if (skb->priority >= 256 && skb->priority <= 263) return; ethhdr = skb_header_pointer(skb, offset, sizeof(*ethhdr), ðhdr_tmp); if (!ethhdr) return; switch (ethhdr->h_proto) { case htons(ETH_P_8021Q): vhdr = skb_header_pointer(skb, offset + sizeof(*vhdr), sizeof(*vhdr), &vhdr_tmp); if (!vhdr) return; prio = ntohs(vhdr->h_vlan_TCI) & VLAN_PRIO_MASK; prio = prio >> VLAN_PRIO_SHIFT; break; case htons(ETH_P_IP): ip_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), sizeof(*ip_hdr), &ip_hdr_tmp); if (!ip_hdr) return; prio = (ipv4_get_dsfield(ip_hdr) & 0xfc) >> 5; break; case htons(ETH_P_IPV6): ip6_hdr = skb_header_pointer(skb, offset + sizeof(*ethhdr), sizeof(*ip6_hdr), &ip6_hdr_tmp); if (!ip6_hdr) return; prio = (ipv6_get_dsfield(ip6_hdr) & 0xfc) >> 5; break; default: return; } skb->priority = prio + 256; } static int batadv_recv_unhandled_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { kfree_skb(skb); return NET_RX_DROP; } /* incoming packets with the batman ethertype received on any active hard * interface */ /** * batadv_batman_skb_recv() - Handle incoming message from an hard interface * @skb: the received packet * @dev: the net device that the packet was received on * @ptype: packet type of incoming packet (ETH_P_BATMAN) * @orig_dev: the original receive net device (e.g. bonded device) * * Return: NET_RX_SUCCESS on success or NET_RX_DROP in case of failure */ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { struct batadv_priv *bat_priv; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *hard_iface; u8 idx; hard_iface = container_of(ptype, struct batadv_hard_iface, batman_adv_ptype); /* Prevent processing a packet received on an interface which is getting * shut down otherwise the packet may trigger de-reference errors * further down in the receive path. */ if (!kref_get_unless_zero(&hard_iface->refcount)) goto err_out; skb = skb_share_check(skb, GFP_ATOMIC); /* skb was released by skb_share_check() */ if (!skb) goto err_put; /* packet should hold at least type and version */ if (unlikely(!pskb_may_pull(skb, 2))) goto err_free; /* expect a valid ethernet header here. */ if (unlikely(skb->mac_len != ETH_HLEN || !skb_mac_header(skb))) goto err_free; if (!hard_iface->mesh_iface) goto err_free; bat_priv = netdev_priv(hard_iface->mesh_iface); if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto err_free; /* discard frames on not active interfaces */ if (hard_iface->if_status != BATADV_IF_ACTIVE) goto err_free; batadv_ogm_packet = (struct batadv_ogm_packet *)skb->data; if (batadv_ogm_packet->version != BATADV_COMPAT_VERSION) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: incompatible batman version (%i)\n", batadv_ogm_packet->version); goto err_free; } /* reset control block to avoid left overs from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); idx = batadv_ogm_packet->packet_type; (*batadv_rx_handler[idx])(skb, hard_iface); batadv_hardif_put(hard_iface); /* return NET_RX_SUCCESS in any case as we * most probably dropped the packet for * routing-logical reasons. */ return NET_RX_SUCCESS; err_free: kfree_skb(skb); err_put: batadv_hardif_put(hard_iface); err_out: return NET_RX_DROP; } static void batadv_recv_handler_init(void) { int i; for (i = 0; i < ARRAY_SIZE(batadv_rx_handler); i++) batadv_rx_handler[i] = batadv_recv_unhandled_packet; for (i = BATADV_UNICAST_MIN; i <= BATADV_UNICAST_MAX; i++) batadv_rx_handler[i] = batadv_recv_unhandled_unicast_packet; /* compile time checks for sizes */ BUILD_BUG_ON(sizeof(struct batadv_bla_claim_dst) != 6); BUILD_BUG_ON(sizeof(struct batadv_ogm_packet) != 24); BUILD_BUG_ON(sizeof(struct batadv_icmp_header) != 20); BUILD_BUG_ON(sizeof(struct batadv_icmp_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_icmp_packet_rr) != 116); BUILD_BUG_ON(sizeof(struct batadv_unicast_packet) != 10); BUILD_BUG_ON(sizeof(struct batadv_unicast_4addr_packet) != 18); BUILD_BUG_ON(sizeof(struct batadv_frag_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_bcast_packet) != 14); BUILD_BUG_ON(sizeof(struct batadv_coded_packet) != 46); BUILD_BUG_ON(sizeof(struct batadv_unicast_tvlv_packet) != 20); BUILD_BUG_ON(sizeof(struct batadv_tvlv_hdr) != 4); BUILD_BUG_ON(sizeof(struct batadv_tvlv_gateway_data) != 8); BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_vlan_data) != 8); BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12); BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8); i = sizeof_field(struct sk_buff, cb); BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i); /* broadcast packet */ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet; /* multicast packet */ batadv_rx_handler[BATADV_MCAST] = batadv_recv_mcast_packet; /* unicast packets ... */ /* unicast with 4 addresses packet */ batadv_rx_handler[BATADV_UNICAST_4ADDR] = batadv_recv_unicast_packet; /* unicast packet */ batadv_rx_handler[BATADV_UNICAST] = batadv_recv_unicast_packet; /* unicast tvlv packet */ batadv_rx_handler[BATADV_UNICAST_TVLV] = batadv_recv_unicast_tvlv; /* batman icmp packet */ batadv_rx_handler[BATADV_ICMP] = batadv_recv_icmp_packet; /* Fragmented packets */ batadv_rx_handler[BATADV_UNICAST_FRAG] = batadv_recv_frag_packet; } /** * batadv_recv_handler_register() - Register handler for batman-adv packet type * @packet_type: batadv_packettype which should be handled * @recv_handler: receive handler for the packet type * * Return: 0 on success or negative error number in case of failure */ int batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { int (*curr)(struct sk_buff *skb, struct batadv_hard_iface *recv_if); curr = batadv_rx_handler[packet_type]; if (curr != batadv_recv_unhandled_packet && curr != batadv_recv_unhandled_unicast_packet) return -EBUSY; batadv_rx_handler[packet_type] = recv_handler; return 0; } /** * batadv_recv_handler_unregister() - Unregister handler for packet type * @packet_type: batadv_packettype which should no longer be handled */ void batadv_recv_handler_unregister(u8 packet_type) { batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } /** * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in * the header * @skb: skb pointing to fragmented socket buffers * @payload_ptr: Pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed * * payload_ptr must always point to an address in the skb head buffer and not to * a fragment. * * Return: big endian crc32c of the checksummed data */ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) { u32 crc = 0; unsigned int from; unsigned int to = skb->len; struct skb_seq_state st; const u8 *data; unsigned int len; unsigned int consumed = 0; from = (unsigned int)(payload_ptr - skb->data); skb_prepare_seq_read(skb, from, to, &st); while ((len = skb_seq_read(consumed, &data, &st)) != 0) { crc = crc32c(crc, data, len); consumed += len; } return htonl(crc); } /** * batadv_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header * * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the * skb is vlan tagged. Otherwise BATADV_NO_FLAGS. */ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) { struct ethhdr *ethhdr = (struct ethhdr *)(skb->data + header_len); struct vlan_ethhdr *vhdr; unsigned short vid; if (ethhdr->h_proto != htons(ETH_P_8021Q)) return BATADV_NO_FLAGS; if (!pskb_may_pull(skb, header_len + VLAN_ETH_HLEN)) return BATADV_NO_FLAGS; vhdr = (struct vlan_ethhdr *)(skb->data + header_len); vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; /* VID 0 is only used to indicate "priority tag" frames which only * contain priority information and no VID. */ if (vid == 0) return BATADV_NO_FLAGS; vid |= BATADV_VLAN_HAS_TAG; return vid; } /** * batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * * Return: true if AP isolation is on for the VLAN identified by vid, false * otherwise */ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) { bool ap_isolation_enabled = false; struct batadv_meshif_vlan *vlan; /* if the AP isolation is requested on a VLAN, then check for its * setting in the proper VLAN private data structure */ vlan = batadv_meshif_vlan_get(bat_priv, vid); if (vlan) { ap_isolation_enabled = atomic_read(&vlan->ap_isolation); batadv_meshif_vlan_put(vlan); } return ap_isolation_enabled; } /** * batadv_throw_uevent() - Send an uevent with batman-adv specific env data * @bat_priv: the bat priv with all the mesh interface information * @type: subsystem type of event. Stored in uevent's BATTYPE * @action: action type of event. Stored in uevent's BATACTION * @data: string with additional information to the event (ignored for * BATADV_UEV_DEL). Stored in uevent's BATDATA * * Return: 0 on success or negative error number in case of failure */ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data) { int ret = -ENOMEM; struct kobject *bat_kobj; char *uevent_env[4] = { NULL, NULL, NULL, NULL }; bat_kobj = &bat_priv->mesh_iface->dev.kobj; uevent_env[0] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_TYPE_VAR, batadv_uev_type_str[type]); if (!uevent_env[0]) goto report_error; uevent_env[1] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_ACTION_VAR, batadv_uev_action_str[action]); if (!uevent_env[1]) goto free_first_env; /* If the event is DEL, ignore the data field */ if (action != BATADV_UEV_DEL) { uevent_env[2] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_DATA_VAR, data); if (!uevent_env[2]) goto free_second_env; } ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); kfree(uevent_env[2]); free_second_env: kfree(uevent_env[1]); free_first_env: kfree(uevent_env[0]); if (ret) report_error: batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n", batadv_uev_type_str[type], batadv_uev_action_str[action], (action == BATADV_UEV_DEL ? "NULL" : data), ret); return ret; } module_init(batadv_init); module_exit(batadv_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR(BATADV_DRIVER_AUTHOR); MODULE_DESCRIPTION(BATADV_DRIVER_DESC); MODULE_VERSION(BATADV_SOURCE_VERSION); MODULE_ALIAS_RTNL_LINK("batadv"); MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME); |
158 159 75 75 75 75 75 74 75 75 4 75 75 75 75 86 148 148 5 158 164 163 164 98 98 2 98 98 86 98 133 162 127 127 128 128 128 128 128 128 127 95 95 95 95 1 94 94 158 131 94 94 158 81 5 146 81 5 5 5 5 146 146 162 164 130 3 1 1 164 164 163 131 132 132 107 130 1 131 112 64 37 164 162 158 164 146 164 2 164 164 158 1 158 131 132 132 158 159 159 1 159 164 159 137 11 11 11 11 146 7 70 2 2 17 16 11 3 2 1 3 3 1 3 2 2 2 2 1 1 3 12 1 1 1 1 1 1 1 11 1 1 1 11 1 12 3 55 55 55 55 158 105 105 135 11 132 131 132 121 124 123 123 123 124 122 111 122 122 124 123 4 124 124 123 124 4 100 100 100 100 100 100 100 100 100 100 100 100 100 98 98 36 36 36 36 36 36 75 75 74 74 75 75 75 75 74 75 75 75 75 74 75 75 75 75 75 75 75 75 75 74 74 74 75 75 74 75 75 23 23 23 23 23 23 16 16 16 23 22 75 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 2 1 3 3 3 3 3 3 3 75 3 3 75 75 12 63 75 73 1 1 1 1 1 8 1 7 1 75 8 75 8 74 3 3 3 75 75 75 75 75 74 75 75 1 1 1 74 2 2 2 2 75 75 74 75 75 75 1 73 75 74 75 3 74 75 74 75 74 74 75 75 75 75 75 75 75 75 22 22 75 74 75 4 75 75 71 73 75 75 75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> #include <linux/timekeeping.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "locking.h" #include "tree-log.h" #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" #include "scrub.h" static struct kmem_cache *btrfs_trans_handle_cachep; /* * Transaction states and transitions * * No running transaction (fs tree blocks are not modified) * | * | To next stage: * | Call start_transaction() variants. Except btrfs_join_transaction_nostart(). * V * Transaction N [[TRANS_STATE_RUNNING]] * | * | New trans handles can be attached to transaction N by calling all * | start_transaction() variants. * | * | To next stage: * | Call btrfs_commit_transaction() on any trans handle attached to * | transaction N * V * Transaction N [[TRANS_STATE_COMMIT_PREP]] * | * | If there are simultaneous calls to btrfs_commit_transaction() one will win * | the race and the rest will wait for the winner to commit the transaction. * | * | The winner will wait for previous running transaction to completely finish * | if there is one. * | * Transaction N [[TRANS_STATE_COMMIT_START]] * | * | Then one of the following happens: * | - Wait for all other trans handle holders to release. * | The btrfs_commit_transaction() caller will do the commit work. * | - Wait for current transaction to be committed by others. * | Other btrfs_commit_transaction() caller will do the commit work. * | * | At this stage, only btrfs_join_transaction*() variants can attach * | to this running transaction. * | All other variants will wait for current one to finish and attach to * | transaction N+1. * | * | To next stage: * | Caller is chosen to commit transaction N, and all other trans handle * | haven been released. * V * Transaction N [[TRANS_STATE_COMMIT_DOING]] * | * | The heavy lifting transaction work is started. * | From running delayed refs (modifying extent tree) to creating pending * | snapshots, running qgroups. * | In short, modify supporting trees to reflect modifications of subvolume * | trees. * | * | At this stage, all start_transaction() calls will wait for this * | transaction to finish and attach to transaction N+1. * | * | To next stage: * | Until all supporting trees are updated. * V * Transaction N [[TRANS_STATE_UNBLOCKED]] * | Transaction N+1 * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]] * | need to write them back to disk and update | * | super blocks. | * | | * | At this stage, new transaction is allowed to | * | start. | * | All new start_transaction() calls will be | * | attached to transid N+1. | * | | * | To next stage: | * | Until all tree blocks are super blocks are | * | written to block devices | * V | * Transaction N [[TRANS_STATE_COMPLETED]] V * All tree blocks and super blocks are written. Transaction N+1 * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]] * data structures will be cleaned up. | Life goes on */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_COMMIT_PREP] = 0U, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOSTART), [TRANS_STATE_UNBLOCKED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), }; void btrfs_put_transaction(struct btrfs_transaction *transaction) { WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", transaction->delayed_refs.pending_csums); /* * If any block groups are found in ->deleted_bgs then it's * because the transaction was aborted and a commit did not * happen (things failed before writing the new superblock * and calling btrfs_finish_extent_commit()), so we can not * discard the physical locations of the block groups. */ while (!list_empty(&transaction->deleted_bgs)) { struct btrfs_block_group *cache; cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); /* * Not strictly necessary to lock, as no other task will be using a * block_group on the deleted_bgs list during a transaction abort. */ spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } WARN_ON(!list_empty(&transaction->dev_update_list)); kfree(transaction); } } static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; /* * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); down_write(&fs_info->commit_root_sem); if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) fs_info->last_reloc_trans = trans->transid; list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } /* We can free old roots now. */ spin_lock(&cur_trans->dropped_roots_lock); while (!list_empty(&cur_trans->dropped_roots)) { root = list_first_entry(&cur_trans->dropped_roots, struct btrfs_root, root_list); list_del_init(&root->root_list); spin_unlock(&cur_trans->dropped_roots_lock); btrfs_free_log(trans, root); btrfs_drop_and_free_fs_root(fs_info, root); spin_lock(&cur_trans->dropped_roots_lock); } spin_unlock(&cur_trans->dropped_roots_lock); up_write(&fs_info->commit_root_sem); } static inline void extwriter_counter_inc(struct btrfs_transaction *trans, unsigned int type) { if (type & TRANS_EXTWRITERS) atomic_inc(&trans->num_extwriters); } static inline void extwriter_counter_dec(struct btrfs_transaction *trans, unsigned int type) { if (type & TRANS_EXTWRITERS) atomic_dec(&trans->num_extwriters); } static inline void extwriter_counter_init(struct btrfs_transaction *trans, unsigned int type) { atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); } static inline int extwriter_counter_read(struct btrfs_transaction *trans) { return atomic_read(&trans->num_extwriters); } /* * To be called after doing the chunk btree updates right after allocating a new * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a * chunk after all chunk btree updates and after finishing the second phase of * chunk allocation (btrfs_create_pending_block_groups()) in case some block * group had its chunk item insertion delayed to the second phase. */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } /* * either allocate a new transaction or hop into the existing one */ static noinline int join_transaction(struct btrfs_fs_info *fs_info, unsigned int type) { struct btrfs_transaction *cur_trans; spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { const int abort_error = cur_trans->aborted; spin_unlock(&fs_info->trans_lock); return abort_error; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); return -EBUSY; } refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); return 0; } spin_unlock(&fs_info->trans_lock); /* * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the * current transaction, and commit it. If there is no transaction, just * return ENOENT. */ if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) return -ENOENT; /* * JOIN_NOLOCK only happens during the transaction commit, so * it is impossible that ->running_transaction is NULL */ BUG_ON(type == TRANS_JOIN_NOLOCK); cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); if (!cur_trans) return -ENOMEM; btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); spin_lock(&fs_info->trans_lock); if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure * to redo the checks above */ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); goto loop; } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); return -EROFS; } cur_trans->fs_info = fs_info; atomic_set(&cur_trans->pending_ordered, 0); init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ refcount_set(&cur_trans->use_count, 2); cur_trans->flags = 0; cur_trans->start_time = ktime_get_seconds(); memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); xa_init(&cur_trans->delayed_refs.head_refs); xa_init(&cur_trans->delayed_refs.dirty_extents); /* * although the tree mod log is per file system and not per transaction, * the log must never go across transaction boundaries. */ smp_mb(); if (!list_empty(&fs_info->tree_mod_seq_list)) WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n"); if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->dev_update_list); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); mutex_init(&cur_trans->cache_write_mutex); spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS); btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; spin_unlock(&fs_info->trans_lock); return 0; } /* * This does all the record keeping required to make sure that a shareable root * is properly recorded in a given transaction. This is required to make sure * the old root from before we joined the transaction is deleted when the * transaction commits. */ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, int force) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && btrfs_get_root_last_trans(root) < trans->transid) || force) { WARN_ON(!force && root->commit_root != root->node); /* * see below for IN_TRANS_SETUP usage rules * we have the reloc mutex held now, so there * is only one writer in this function */ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); /* make sure readers find IN_TRANS_SETUP before * they find our root->last_trans update */ smp_wmb(); spin_lock(&fs_info->fs_roots_radix_lock); if (btrfs_get_root_last_trans(root) == trans->transid && !force) { spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } radix_tree_tag_set(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_set_root_last_trans(root, trans->transid); /* this is pretty tricky. We don't want to * take the relocation lock in btrfs_record_root_in_trans * unless we're really doing the first setup for this root in * this transaction. * * Normally we'd use root->last_trans as a flag to decide * if we want to take the expensive mutex. * * But, we have to set root->last_trans before we * init the relocation root, otherwise, we trip over warnings * in ctree.c. The solution used here is to flag ourselves * with root IN_TRANS_SETUP. When this is 1, we're still * fixing up the reloc trees and everyone must wait. * * When this is zero, they can trust root->last_trans and fly * through btrfs_record_root_in_trans without having to take the * lock. smp_wmb() makes sure that all the writes above are * done before we pop in the zero below */ ret = btrfs_init_reloc_root(trans, root); smp_mb__before_atomic(); clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } return ret; } void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; /* Add ourselves to the transaction dropped list */ spin_lock(&cur_trans->dropped_roots_lock); list_add_tail(&root->root_list, &cur_trans->dropped_roots); spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return 0; /* * see record_root_in_trans for comments about IN_TRANS_SETUP usage * and barriers */ smp_rmb(); if (btrfs_get_root_last_trans(root) == trans->transid && !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; mutex_lock(&fs_info->reloc_mutex); ret = record_root_in_trans(trans, root, 0); mutex_unlock(&fs_info->reloc_mutex); return ret; } static inline int is_transaction_blocked(struct btrfs_transaction *trans) { return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && !TRANS_ABORTED(trans)); } /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. */ static void wait_current_trans(struct btrfs_fs_info *fs_info) { struct btrfs_transaction *cur_trans; spin_lock(&fs_info->trans_lock); cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || TRANS_ABORTED(cur_trans)); btrfs_put_transaction(cur_trans); } else { spin_unlock(&fs_info->trans_lock); } } static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) { if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return 0; if (type == TRANS_START) return 1; return 0; } static inline bool need_reserve_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; if (!fs_info->reloc_ctl || !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; return true; } static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush, u64 num_bytes, u64 *delayed_refs_bytes) { struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; u64 bytes = num_bytes + *delayed_refs_bytes; int ret; /* * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. */ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); /* * If we are an emergency flush, which can steal from the global block * reserve, then attempt to not reserve space for the delayed refs, as * we will consume space for them from the global block reserve. */ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); } return ret; } static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, unsigned int type, enum btrfs_reserve_flush_enum flush, bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; u64 qgroup_reserved = 0; u64 delayed_refs_bytes = 0; bool reloc_reserved = false; bool do_chunk_alloc = false; int ret; if (BTRFS_FS_ERROR(fs_info)) return ERR_PTR(-EROFS); if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; refcount_inc(&h->use_count); WARN_ON(refcount_read(&h->use_count) > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; } /* * Do the reservation before we join the transaction so we can do all * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { qgroup_reserved = num_items * fs_info->nodesize; /* * Use prealloc for now, as there might be a currently running * transaction that could free this reserved space prematurely * by committing. */ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, enforce_qgroups, false); if (ret) return ERR_PTR(ret); num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); /* * If we plan to insert/update/delete "num_items" from a btree, * we will also generate delayed refs for extent buffers in the * respective btree paths, so reserve space for the delayed refs * that will be generated by the caller as it modifies btrees. * Try to reserve them to avoid excessive use of the global * block reserve. */ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items); /* * Do the reservation for the relocation root creation */ if (need_reserve_reloc_root(root)) { num_bytes += fs_info->nodesize; reloc_reserved = true; } ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes, &delayed_refs_bytes); if (ret) goto reserve_fail; btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true); if (trans_rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !btrfs_block_rsv_full(delayed_refs_rsv)) { /* * Some people call with btrfs_start_transaction(root, 0) * because they can be throttled, but have some other mechanism * for reserving space. We still want these guys to refill the * delayed block_rsv so just add 1 items worth of reservation * here. */ ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); if (ret) goto reserve_fail; } again: h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) { ret = -ENOMEM; goto alloc_fail; } /* * If we are JOIN_NOLOCK we're already committing a transaction and * waiting on this guy, so we don't need to do the sb_start_intwrite * because we're already holding a ref. We need this because we could * have raced in and did an fsync() on a file which can kick a commit * and then we deadlock with somebody doing a freeze. * * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). */ if (type & __TRANS_FREEZABLE) sb_start_intwrite(fs_info->sb); if (may_wait_transaction(fs_info, type)) wait_current_trans(fs_info); do { ret = join_transaction(fs_info, type); if (ret == -EBUSY) { wait_current_trans(fs_info); if (unlikely(type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)) ret = -ENOENT; } } while (ret == -EBUSY); if (ret < 0) goto join_fail; cur_trans = fs_info->running_transaction; h->transid = cur_trans->transid; h->transaction = cur_trans; refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; h->type = type; INIT_LIST_HEAD(&h->new_bgs); btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS); smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START && may_wait_transaction(fs_info, type)) { current->journal_info = h; btrfs_commit_transaction(h); goto again; } if (num_bytes) { trace_btrfs_space_reservation(fs_info, "transaction", h->transid, num_bytes, 1); h->block_rsv = trans_rsv; h->bytes_reserved = num_bytes; if (delayed_refs_bytes > 0) { trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", h->transid, delayed_refs_bytes, 1); h->delayed_refs_bytes_reserved = delayed_refs_bytes; btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true); delayed_refs_bytes = 0; } h->reloc_reserved = reloc_reserved; } got_it: if (!current->journal_info) current->journal_info = h; /* * If the space_info is marked ALLOC_FORCE then we'll get upgraded to * ALLOC_FORCE the first run through, and then we won't allocate for * anybody else who races in later. We don't care about the return * value here. */ if (do_chunk_alloc && num_bytes) { u64 flags = h->block_rsv->space_info->flags; btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), CHUNK_ALLOC_NO_FORCE); } /* * btrfs_record_root_in_trans() needs to alloc new extents, and may * call btrfs_join_transaction() while we're also starting a * transaction. * * Thus it need to be called after current->journal_info initialized, * or we can deadlock. */ ret = btrfs_record_root_in_trans(h, root); if (ret) { /* * The transaction handle is fully initialized and linked with * other structures so it needs to be ended in case of errors, * not just freed. */ btrfs_end_transaction(h); goto reserve_fail; } /* * Now that we have found a transaction to be a part of, convert the * qgroup reservation from prealloc to pertrans. A different transaction * can't race in and free our pertrans out from under us. */ if (qgroup_reserved) btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); return h; join_fail: if (type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL, true); } struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL_STEAL, false); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH, true); } struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK, BTRFS_RESERVE_NO_FLUSH, true); } /* * Similar to regular join but it never starts a transaction when none is * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED. * This is similar to btrfs_attach_transaction() but it allows the join to * happen if the transaction commit already started but it's not yet in the * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING). */ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOSTART, BTRFS_RESERVE_NO_FLUSH, true); } /* * Catch the running transaction. * * It is used when we want to commit the current the transaction, but * don't want to start a new one. * * Note: If this function return -ENOENT, it just means there is no * running transaction. But it is possible that the inactive transaction * is still in the memory, not fully on disk. If you hope there is no * inactive transaction in the fs when -ENOENT is returned, you should * invoke * btrfs_attach_transaction_barrier() */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); } /* * Catch the running transaction. * * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully * complete. */ struct btrfs_trans_handle * btrfs_attach_transaction_barrier(struct btrfs_root *root) { struct btrfs_trans_handle *trans; trans = start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); if (trans == ERR_PTR(-ENOENT)) { int ret; ret = btrfs_wait_for_commit(root->fs_info, 0); if (ret) return ERR_PTR(ret); } return trans; } /* Wait for a transaction commit to reach at least the given state. */ static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { struct btrfs_fs_info *fs_info = commit->fs_info; u64 transid = commit->transid; bool put = false; /* * At the moment this function is called with min_state either being * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED. */ if (min_state == TRANS_STATE_COMPLETED) btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); else btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); while (1) { wait_event(commit->commit_wait, commit->state >= min_state); if (put) btrfs_put_transaction(commit); if (min_state < TRANS_STATE_COMPLETED) break; /* * A transaction isn't really completed until all of the * previous transactions are completed, but with fsync we can * end up with SUPER_COMMITTED transactions before a COMPLETED * transaction. Wait for those. */ spin_lock(&fs_info->trans_lock); commit = list_first_entry_or_null(&fs_info->trans_list, struct btrfs_transaction, list); if (!commit || commit->transid > transid) { spin_unlock(&fs_info->trans_lock); break; } refcount_inc(&commit->use_count); put = true; spin_unlock(&fs_info->trans_lock); } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) { struct btrfs_transaction *cur_trans = NULL, *t; int ret = 0; if (transid) { if (transid <= btrfs_get_last_trans_committed(fs_info)) goto out; /* find specified transaction */ spin_lock(&fs_info->trans_lock); list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; refcount_inc(&cur_trans->use_count); ret = 0; break; } if (t->transid > transid) { ret = 0; break; } } spin_unlock(&fs_info->trans_lock); /* * The specified transaction doesn't exist, or we * raced with btrfs_commit_transaction */ if (!cur_trans) { if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; goto out; } } else { /* find newest transaction that is committing | committed */ spin_lock(&fs_info->trans_lock); list_for_each_entry_reverse(t, &fs_info->trans_list, list) { if (t->state >= TRANS_STATE_COMMIT_START) { if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; refcount_inc(&cur_trans->use_count); break; } } spin_unlock(&fs_info->trans_lock); if (!cur_trans) goto out; /* nothing committing|committed */ } wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); out: return ret; } void btrfs_throttle(struct btrfs_fs_info *fs_info) { wait_current_trans(fs_info); } bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; if (cur_trans->state >= TRANS_STATE_COMMIT_START || test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; if (btrfs_check_space_for_delayed_refs(trans->fs_info)) return true; return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50); } static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->block_rsv) { ASSERT(!trans->bytes_reserved); ASSERT(!trans->delayed_refs_bytes_reserved); return; } if (!trans->bytes_reserved) { ASSERT(!trans->delayed_refs_bytes_reserved); return; } ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, trans->bytes_reserved, NULL); trans->bytes_reserved = 0; if (!trans->delayed_refs_bytes_reserved) return; trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", trans->transid, trans->delayed_refs_bytes_reserved, 0); btrfs_block_rsv_release(fs_info, &trans->delayed_rsv, trans->delayed_refs_bytes_reserved, NULL); trans->delayed_refs_bytes_reserved = 0; } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int throttle) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); trans->block_rsv = trans->orig_rsv; return 0; } btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); cond_wake_up(&cur_trans->writer_wait); btrfs_lockdep_release(info, btrfs_trans_num_extwriters); btrfs_lockdep_release(info, btrfs_trans_num_writers); btrfs_put_transaction(cur_trans); if (current->journal_info == trans) current->journal_info = NULL; if (throttle) btrfs_run_delayed_iputs(info); if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) ret = trans->aborted; else ret = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); return ret; } int btrfs_end_transaction(struct btrfs_trans_handle *trans) { return __btrfs_end_transaction(trans, 0); } int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) { return __btrfs_end_transaction(trans, 1); } /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of * those extents are sent to disk but does not wait on them */ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { int ret = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; while (find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; ret = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error * and wait for writeback of this range to finish - because we * failed to set the bit EXTENT_NEED_WAIT for the range, a call * to __btrfs_wait_marked_extents() would not know that * writeback for this range started and therefore wouldn't * wait for it to finish - we don't want to commit a * superblock that points to btree nodes/leafs for which * writeback hasn't finished yet (and without errors). * We cleanup any entries left in the io tree when committing * the transaction (through extent_io_tree_release()). */ if (ret == -ENOMEM) { ret = 0; wait_writeback = true; } if (!ret) ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && wait_writeback) ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); if (ret) break; cached_state = NULL; cond_resched(); start = end + 1; } return ret; } /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of * those extents are on disk for transaction or log commit. We wait * on all the pages and clear them from the dirty pages state tree */ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; int ret = 0; while (find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries * left in the io tree. For a log commit, we don't remove them * after committing the log because the tree can be accessed * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ ret = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, &cached_state); if (ret == -ENOMEM) ret = 0; if (!ret) ret = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); if (ret) break; cached_state = NULL; cond_resched(); start = end + 1; } return ret; } static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { bool errors = false; int err; err = __btrfs_wait_marked_extents(fs_info, dirty_pages); if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) errors = true; if (errors && !err) err = -EIO; return err; } int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) { struct btrfs_fs_info *fs_info = log_root->fs_info; struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; bool errors = false; int err; ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); err = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY) && test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) errors = true; if ((mark & EXTENT_NEW) && test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) errors = true; if (errors && !err) err = -EIO; return err; } /* * When btree blocks are allocated the corresponding extents are marked dirty. * This function ensures such extents are persisted on disk for transaction or * log commit. * * @trans: transaction whose dirty pages we'd like to write */ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) { int ret; int ret2; struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages; struct btrfs_fs_info *fs_info = trans->fs_info; struct blk_plug plug; blk_start_plug(&plug); ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY); blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; else if (ret2) return ret2; else return 0; } /* * this is used to update the root pointer in the tree of tree roots. * * But, in the case of the extent allocation tree, updating the root * pointer may allocate blocks which may change the root of the extent * allocation tree. * * So, this loops and repeats and makes sure the cowonly root didn't * change while the root pointer was being updated in the metadata. */ static int update_cowonly_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; u64 old_root_bytenr; u64 old_root_used; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; old_root_used = btrfs_root_used(&root->root_item); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); if (ret) return ret; old_root_used = btrfs_root_used(&root->root_item); } return 0; } /* * update all the cowonly tree roots on disk * * The error handling in this function may not be obvious. Any of the * failures will cause the file system to go offline. We still need * to clean up the delayed refs. */ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; struct extent_buffer *eb; int ret; /* * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret) return ret; ret = btrfs_run_dev_stats(trans); if (ret) return ret; ret = btrfs_run_dev_replace(trans); if (ret) return ret; ret = btrfs_run_qgroups(trans); if (ret) return ret; ret = btrfs_setup_space_cache(trans); if (ret) return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); list_add_tail(&root->dirty_list, &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; } /* Now flush any delayed refs generated by updating all of the roots */ ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; /* * We're writing the dirty block groups, which could generate * delayed refs, which could generate more dirty block groups, * so we want to keep this flushing in this loop to make sure * everything gets run. */ ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; } if (!list_empty(&fs_info->dirty_cowonly_roots)) goto again; /* Update dev-replace pointer once everything is committed */ fs_info->dev_replace.committed_cursor_left = fs_info->dev_replace.cursor_left_last_write_of_item; return 0; } /* * If we had a pending drop we need to see if there are any others left in our * dead roots list, and if not clear our bit and wake any waiters. */ void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { /* * We put the drop in progress roots at the front of the list, so if the * first entry doesn't have UNFINISHED_DROP set we can wake everybody * up. */ spin_lock(&fs_info->trans_lock); if (!list_empty(&fs_info->dead_roots)) { struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { spin_unlock(&fs_info->trans_lock); return; } } spin_unlock(&fs_info->trans_lock); btrfs_wake_unfinished_drop(fs_info); } /* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ void btrfs_add_dead_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); /* We want to process the partially complete drops first. */ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) list_add(&root->root_list, &fs_info->dead_roots); else list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } /* * Update each subvolume root and its relocation root, if it exists, in the tree * of tree roots. Also free log roots if they exist. */ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *gang[8]; int i; int ret; /* * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; int ret2; /* * At this point we can neither have tasks logging inodes * from a root nor trying to commit a log tree. */ ASSERT(atomic_read(&root->log_writers) == 0); ASSERT(atomic_read(&root->log_commit[0]) == 0); ASSERT(atomic_read(&root->log_commit[1]) == 0); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), BTRFS_ROOT_TRANS_TAG); btrfs_qgroup_free_meta_all_pertrans(root); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); ret2 = btrfs_update_reloc_root(trans, root); if (ret2) return ret2; /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_mb__after_atomic(); if (root->commit_root != root->node) { list_add_tail(&root->dirty_list, &trans->transaction->switch_commits); btrfs_set_root_node(&root->root_item, root->node); } ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); } } spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } /* * Do all special snapshot related qgroup dirty hack. * * Will do all needed qgroup inherit and dirty hack like switch commit * roots inside one transaction and write all btree into disk, to make * qgroup works. */ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *src, struct btrfs_root *parent, struct btrfs_qgroup_inherit *inherit, u64 dst_objectid) { struct btrfs_fs_info *fs_info = src->fs_info; int ret; /* * Save some performance in the case that qgroups are not enabled. If * this check races with the ioctl, rescan will kick in anyway. */ if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * Ensure dirty @src will be committed. Or, after coming * commit_fs_roots() and switch_commit_roots(), any dirty but not * recorded root will never be updated again, causing an outdated root * item. */ ret = record_root_in_trans(trans, src, 1); if (ret) return ret; /* * btrfs_qgroup_inherit relies on a consistent view of the usage for the * src root, so we must run the delayed refs here. * * However this isn't particularly fool proof, because there's no * synchronization keeping us from changing the tree after this point * before we do the qgroup_inherit, or even from making changes while * we're doing the qgroup_inherit. But that's a problem for the future, * for now flush the delayed refs to narrow the race window where the * qgroup counters could end up wrong. */ ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = commit_fs_roots(trans); if (ret) goto out; ret = btrfs_qgroup_account_extents(trans); if (ret < 0) goto out; /* Now qgroup are all updated, we can inherit it to new qgroups */ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid, btrfs_root_id(parent), inherit); if (ret < 0) goto out; /* * Now we do a simplified commit transaction, which will: * 1) commit all subvolume and extent tree * To ensure all subvolume and extent tree have a valid * commit_root to accounting later insert_dir_item() * 2) write all btree blocks onto disk * This is to make sure later btree modification will be cowed * Or commit_root can be populated and cause wrong qgroup numbers * In this simplified commit, we don't really care about other trees * like chunk and root tree, as they won't affect qgroup. * And we don't write super to avoid half committed status. */ ret = commit_cowonly_roots(trans); if (ret) goto out; switch_commit_roots(trans); ret = btrfs_write_and_wait_transaction(trans); if (ret) btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction for qgroup"); out: /* * Force parent root to be updated, as we recorded it before so its * last_trans == cur_transid. * Or it won't be committed again onto disk after later * insert_dir_item() */ if (!ret) ret = record_root_in_trans(trans, parent, 1); return ret; } /* * new snapshots need to be created at a very specific time in the * transaction commit. This does the actual creation. * * Note: * If the error which may affect the commitment of the current transaction * happens, we should return the error number. If the error which just affect * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key key; struct btrfs_root_item *new_root_item; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct btrfs_inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec64 cur_time; int ret = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; u64 root_flags; unsigned int nofs_flags; struct fscrypt_name fname; ASSERT(pending->path); path = pending->path; ASSERT(pending->root_item); new_root_item = pending->root_item; /* * We're inside a transaction and must make sure that any potential * allocations with GFP_KERNEL in fscrypt won't recurse back to * filesystem. */ nofs_flags = memalloc_nofs_save(); pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); if (pending->error) goto free_pending; pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) goto free_fname; /* * Make qgroup to skip current new snapshot's qgroupid, as it is * accounted by later btrfs_qgroup_inherit(). */ btrfs_set_skip_qgroup(trans, objectid); btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { pending->error = btrfs_block_rsv_add(fs_info, &pending->block_rsv, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) goto clear_skip_qgroup; } key.objectid = objectid; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; cur_time = current_time(&parent_inode->vfs_inode); /* * insert the directory item */ ret = btrfs_set_inode_index(parent_inode, &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(parent_inode), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); btrfs_abort_transaction(trans, ret); goto fail; } btrfs_release_path(path); ret = btrfs_create_qgroup(trans, objectid); if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); goto fail; } /* * pull in the delayed directory update * and the delayed inode item * otherwise we corrupt the FS during * snapshot */ ret = btrfs_run_delayed_items(trans); if (ret) { /* Transaction aborted */ btrfs_abort_transaction(trans, ret); goto fail; } ret = record_root_in_trans(trans, root, 0); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); root_flags = btrfs_root_flags(new_root_item); if (pending->readonly) root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; else root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; btrfs_set_root_flags(new_root_item, root_flags); btrfs_set_root_generation_v2(new_root_item, trans->transid); generate_random_guid(new_root_item->uuid); memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { memset(new_root_item->received_uuid, 0, sizeof(new_root_item->received_uuid)); memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); btrfs_set_root_stransid(new_root_item, 0); btrfs_set_root_rtransid(new_root_item, 0); } btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_copy_root(trans, root, old, &tmp, objectid); /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* * insert root back/forward references */ ret = btrfs_add_root_ref(trans, objectid, btrfs_root_id(parent_root), btrfs_ino(parent_inode), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } key.offset = (u64)-1; pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); pending->snap = NULL; btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* * Do special qgroup accounting for snapshot, as we do some qgroup * snapshot hack to do fast snapshot. * To co-operate with that hack, we do hack again. * Or snapshot will be greatly slowed down by a subtree qgroup rescan */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) ret = qgroup_account_snapshot(trans, root, parent_root, pending->inherit, objectid); else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid, btrfs_root_id(parent_root), pending->inherit); if (ret < 0) goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, parent_inode, &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + fname.disk_name.len * 2); inode_set_mtime_to_ts(&parent_inode->vfs_inode, inode_set_ctime_current(&parent_inode->vfs_inode)); ret = btrfs_update_inode_fallback(trans, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); goto fail; } } fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); free_fname: fscrypt_free_filename(&fname); free_pending: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); pending->path = NULL; return ret; } /* * create all the snapshots we've scheduled for creation */ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) { struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; int ret = 0; list_for_each_entry_safe(pending, next, head, list) { list_del(&pending->list); ret = create_pending_snapshot(trans, pending); if (ret) break; } return ret; } static void update_super_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root_item *root_item; struct btrfs_super_block *super; super = fs_info->super_copy; root_item = &fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; super->chunk_root_generation = root_item->generation; super->chunk_root_level = root_item->level; root_item = &fs_info->tree_root->root_item; super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; if (btrfs_test_opt(fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags)) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; int ret = 0; spin_lock(&info->trans_lock); trans = info->running_transaction; if (trans) ret = is_transaction_blocked(trans); spin_unlock(&info->trans_lock); return ret; } void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans; /* Kick the transaction kthread. */ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); /* take transaction reference */ cur_trans = trans->transaction; refcount_inc(&cur_trans->use_count); btrfs_end_transaction(trans); /* * Wait for the current transaction commit to start and block * subsequent transaction joins */ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); btrfs_put_transaction(cur_trans); } /* * If there is a running transaction commit it or if it's already committing, * wait for its commit to complete. Does not start and commit a new transaction * if there isn't any running. */ int btrfs_commit_current_transaction(struct btrfs_root *root) { struct btrfs_trans_handle *trans; trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { int ret = PTR_ERR(trans); return (ret == -ENOENT) ? 0 : ret; } return btrfs_commit_transaction(trans); } static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; WARN_ON(refcount_read(&trans->use_count) > 1); btrfs_abort_transaction(trans, err); spin_lock(&fs_info->trans_lock); /* * If the transaction is removed from the list, it means this * transaction has been committed successfully, so it is impossible * to call the cleanup function. */ BUG_ON(list_empty(&cur_trans->list)); if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); /* * The thread has already released the lockdep map as reader * already in btrfs_commit_transaction(). */ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); spin_lock(&fs_info->trans_lock); } /* * Now that we know no one else is still using the transaction we can * remove the transaction from the list of transactions. This avoids * the transaction kthread from cleaning up the transaction while some * other task is still using it, which could result in a use-after-free * on things like log trees, as it forces the transaction kthread to * wait for this transaction to be cleaned up by us. */ list_del_init(&cur_trans->list); spin_unlock(&fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction); spin_lock(&fs_info->trans_lock); if (cur_trans == fs_info->running_transaction) fs_info->running_transaction = NULL; spin_unlock(&fs_info->trans_lock); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); trace_btrfs_transaction_commit(fs_info); if (current->journal_info == trans) current->journal_info = NULL; /* * If relocation is running, we can't cancel scrub because that will * result in a deadlock. Before relocating a block group, relocation * pauses scrub, then starts and commits a transaction before unpausing * scrub. If the transaction commit is being done by the relocation * task or triggered by another task and the relocation task is waiting * for the commit, and we end up here due to an error in the commit * path, then calling btrfs_scrub_cancel() will deadlock, as we are * asking for scrub to stop while having it asked to be paused higher * above in relocation code. */ if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) btrfs_scrub_cancel(fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } /* * Release reserved delayed ref space of all pending block groups of the * transaction and remove them from the list */ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); /* * Not strictly necessary to lock, as no other task will be using a * block_group on the new_bgs list during a transaction abort. */ spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); btrfs_put_block_group(block_group); spin_unlock(&fs_info->unused_bgs_lock); } } static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* * We use try_to_writeback_inodes_sb() here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. * Currently are holding the fs freeze lock, if we do an async flush * we'll do btrfs_join_transaction() and deadlock because we need to * wait for the fs freeze lock. Using the direct flushing we benefit * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. * * Note that try_to_writeback_inodes_sb() will only trigger writeback * if it can read lock sb->s_umount. It will always be able to lock it, * except when the filesystem is being unmounted or being frozen, but in * those cases sync_filesystem() is called, which results in calling * writeback_inodes_sb() while holding a write lock on sb->s_umount. * Note that we don't call writeback_inodes_sb() directly, because it * will emit a warning if sb->s_umount is not locked. */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL); } /* * Add a pending snapshot associated with the given transaction handle to the * respective handle. This must be called after the transaction commit started * and while holding fs_info->trans_lock. * This serves to guarantee a caller of btrfs_commit_transaction() that it can * safely free the pending snapshot pointer in case btrfs_commit_transaction() * returns an error. */ static void add_pending_snapshot(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; if (!trans->pending_snapshot) return; lockdep_assert_held(&trans->fs_info->trans_lock); ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) { fs_info->commit_stats.commit_count++; fs_info->commit_stats.last_commit_dur = interval; fs_info->commit_stats.max_commit_dur = max_t(u64, fs_info->commit_stats.max_commit_dur, interval); fs_info->commit_stats.total_commit_dur += interval; } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; ktime_t start_time; ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto lockdep_trans_commit_start_release; } btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; /* * We only want one transaction commit doing the flushing so we do not * waste a bunch of time on lock contention on the extent root node. */ if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) { /* * Make a pass through all the delayed refs we have so far. * Any running threads may add more while we are here. */ ret = btrfs_run_delayed_refs(trans, 0); if (ret) goto lockdep_trans_commit_start_release; } btrfs_create_pending_block_groups(trans); if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; /* this mutex is also taken before trying to set * block groups readonly. We need to make sure * that nobody has set a block group readonly * after a extents from that block group have been * allocated for cache files. btrfs_set_block_group_ro * will wait for the transaction to commit if it * finds BTRFS_TRANS_DIRTY_BG_RUN set. * * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure * only one process starts all the block group IO. It wouldn't * hurt to have more than one go through, but there's no * real advantage to it either. */ mutex_lock(&fs_info->ro_block_group_mutex); if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) run_it = 1; mutex_unlock(&fs_info->ro_block_group_mutex); if (run_it) { ret = btrfs_start_dirty_block_groups(trans); if (ret) goto lockdep_trans_commit_start_release; } } spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; add_pending_snapshot(trans); spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); return ret; } cur_trans->state = TRANS_STATE_COMMIT_PREP; wake_up(&fs_info->transaction_blocked_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans, want_state); ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); if (ret) goto lockdep_release; spin_lock(&fs_info->trans_lock); } } else { /* * The previous transaction was aborted and was already removed * from the list of transactions at fs_info->trans_list. So we * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; } } cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); spin_unlock(&fs_info->trans_lock); /* * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit */ start_time = ktime_get_ns(); extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); if (ret) goto lockdep_release; ret = btrfs_run_delayed_items(trans); if (ret) goto lockdep_release; /* * The thread has started/joined the transaction thus it holds the * lockdep map as a reader. It has to release it before acquiring the * lockdep map as a writer. */ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters); wait_event(cur_trans->writer_wait, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ ret = btrfs_run_delayed_items(trans); if (ret) { btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; } btrfs_wait_delalloc_flush(fs_info); /* * Wait for all ordered extents started by a fast fsync that joined this * transaction. Otherwise if this transaction commits before the ordered * extents complete we lose logged data after a power failure. */ btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered); wait_event(cur_trans->pending_wait, atomic_read(&cur_trans->pending_ordered) == 0); btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&fs_info->trans_lock); add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); /* * The thread has started/joined the transaction thus it holds the * lockdep map as a reader. It has to release it before acquiring the * lockdep map as a writer. */ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); /* * Make lockdep happy by acquiring the state locks after * btrfs_trans_num_writers is released. If we acquired the state locks * before releasing the btrfs_trans_num_writers lock then lockdep would * complain because we did not follow the reverse order unlocking rule. */ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); /* * We've started the commit, clear the flag in case we were triggered to * do an async commit but somebody else started before the transaction * kthread could do the work. */ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); goto scrub_continue; } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving * extents around in the middle of the commit */ mutex_lock(&fs_info->reloc_mutex); /* * We needn't worry about the delayed items because we will * deal with them in create_pending_snapshot(), which is the * core function of the snapshot creation. */ ret = create_pending_snapshots(trans); if (ret) goto unlock_reloc; /* * We insert the dir indexes of the snapshots and update the inode * of the snapshots' parents after the snapshot creation, so there * are some delayed items which are not dealt with. Now deal with * them. * * We needn't worry that this operation will corrupt the snapshots, * because all the tree which are snapshoted will be forced to COW * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); if (ret) goto unlock_reloc; ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) goto unlock_reloc; /* * make sure none of the code above managed to slip in a * delayed item */ btrfs_assert_delayed_root_empty(fs_info); WARN_ON(cur_trans != trans->transaction); ret = commit_fs_roots(trans); if (ret) goto unlock_reloc; /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ btrfs_free_log_root_tree(trans, fs_info); /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ ret = btrfs_qgroup_account_extents(trans); if (ret < 0) goto unlock_reloc; ret = commit_cowonly_roots(trans); if (ret) goto unlock_reloc; /* * The tasks which save the space cache and inode cache may also * update ->aborted, check it. */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto unlock_reloc; } cur_trans = fs_info->running_transaction; btrfs_set_root_node(&fs_info->tree_root->root_item, fs_info->tree_root->node); list_add_tail(&fs_info->tree_root->dirty_list, &cur_trans->switch_commits); btrfs_set_root_node(&fs_info->chunk_root->root_item, fs_info->chunk_root->node); list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_set_root_node(&fs_info->block_group_root->root_item, fs_info->block_group_root->node); list_add_tail(&fs_info->block_group_root->dirty_list, &cur_trans->switch_commits); } switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(fs_info); btrfs_set_super_log_root(fs_info->super_copy, 0); btrfs_set_super_log_root_level(fs_info->super_copy, 0); memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_copy)); btrfs_commit_device_sizes(cur_trans); clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); btrfs_trans_release_chunk_metadata(trans); /* * Before changing the transaction state to TRANS_STATE_UNBLOCKED and * setting fs_info->running_transaction to NULL, lock tree_log_mutex to * make sure that before we commit our superblock, no other task can * start a new transaction and commit a log tree before we commit our * superblock. Anyone trying to commit a log tree locks this mutex before * writing its superblock. */ mutex_lock(&fs_info->tree_log_mutex); spin_lock(&fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; fs_info->running_transaction = NULL; spin_unlock(&fs_info->trans_lock); mutex_unlock(&fs_info->reloc_mutex); wake_up(&fs_info->transaction_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); /* If we have features changed, wake up the cleaner to update sysfs. */ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && fs_info->cleaner_kthread) wake_up_process(fs_info->cleaner_kthread); ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers * to go about their business */ mutex_unlock(&fs_info->tree_log_mutex); if (ret) goto scrub_continue; /* * We needn't acquire the lock here because there is no other task * which can change it. */ cur_trans->state = TRANS_STATE_SUPER_COMMITTED; wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); btrfs_set_last_trans_committed(fs_info, cur_trans->transid); /* * We needn't acquire the lock here because there is no other task * which can change it. */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); trace_btrfs_transaction_commit(fs_info); interval = ktime_get_ns() - start_time; btrfs_scrub_continue(fs_info); if (current->journal_info == trans) current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); update_commit_stats(fs_info, interval); return ret; unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); scrub_continue: btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); btrfs_cleanup_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; cleanup_transaction(trans, ret); return ret; lockdep_release: btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; lockdep_trans_commit_start_release: btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_end_transaction(trans); return ret; } /* * return < 0 if error * 0 if there are no more dead_roots at the time of call * 1 there are more to be processed, call me again * * The return value indicates there are certainly more snapshots to delete, but * if there comes a new one during processing, it may return 0. We don't mind, * because btrfs_commit_super will poke cleaner thread and it will process it a * few seconds later. */ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; int ret; spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { spin_unlock(&fs_info->trans_lock); return 0; } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); btrfs_debug(fs_info, "cleaner removing %llu", btrfs_root_id(root)); btrfs_kill_all_delayed_nodes(root); if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) ret = btrfs_drop_snapshot(root, 0, 0); else ret = btrfs_drop_snapshot(root, 1, 0); btrfs_put_root(root); return (ret < 0) ? 0 : 1; } /* * We only mark the transaction aborted and then set the file system read-only. * This will prevent new transactions from starting or trying to join this * one. * * This means that error recovery at the call site is limited to freeing * any local memory allocations and passing the error code up without * further cleanup. The transaction should complete as it normally would * in the call path but will return -EIO. * * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. */ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, unsigned int line, int error, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; WRITE_ONCE(trans->aborted, error); WRITE_ONCE(trans->transaction->aborted, error); if (first_hit && error == -ENOSPC) btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); __btrfs_handle_fs_error(fs_info, function, line, error, NULL); } int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = KMEM_CACHE(btrfs_trans_handle, SLAB_TEMPORARY); if (!btrfs_trans_handle_cachep) return -ENOMEM; return 0; } void __cold btrfs_transaction_exit(void) { kmem_cache_destroy(btrfs_trans_handle_cachep); } |
23 23 23 102 101 101 98 97 98 98 97 97 98 1 1 103 103 103 103 103 102 102 102 103 24 24 24 24 103 24 24 21 3 21 102 33 103 103 103 103 28 101 98 10 98 93 93 93 93 10 10 10 98 5 98 5 98 24 29 24 24 24 1 98 24 98 22 21 103 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 | // SPDX-License-Identifier: GPL-2.0+ /* * NILFS recovery logic * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/crc32.h> #include "nilfs.h" #include "segment.h" #include "sufile.h" #include "page.h" #include "segbuf.h" /* * Segment check result */ enum { NILFS_SEG_VALID, NILFS_SEG_NO_SUPER_ROOT, NILFS_SEG_FAIL_IO, NILFS_SEG_FAIL_MAGIC, NILFS_SEG_FAIL_SEQ, NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, NILFS_SEG_FAIL_CHECKSUM_FULL, NILFS_SEG_FAIL_CONSISTENCY, }; /* work structure for recovery */ struct nilfs_recovery_block { ino_t ino; /* * Inode number of the file that this block * belongs to */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ struct list_head list; }; static int nilfs_warn_segment_error(struct super_block *sb, int err) { const char *msg = NULL; switch (err) { case NILFS_SEG_FAIL_IO: nilfs_err(sb, "I/O error reading segment"); return -EIO; case NILFS_SEG_FAIL_MAGIC: msg = "Magic number mismatch"; break; case NILFS_SEG_FAIL_SEQ: msg = "Sequence number mismatch"; break; case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: msg = "Checksum error in super root"; break; case NILFS_SEG_FAIL_CHECKSUM_FULL: msg = "Checksum error in segment payload"; break; case NILFS_SEG_FAIL_CONSISTENCY: msg = "Inconsistency found"; break; case NILFS_SEG_NO_SUPER_ROOT: msg = "No super root in the last segment"; break; default: nilfs_err(sb, "unrecognized segment error %d", err); return -EINVAL; } nilfs_warn(sb, "invalid segment: %s", msg); return -EINVAL; } /** * nilfs_compute_checksum - compute checksum of blocks continuously * @nilfs: nilfs object * @bhs: buffer head of start block * @sum: place to store result * @offset: offset bytes in the first block * @check_bytes: number of bytes to be checked * @start: DBN of start block * @nblock: number of blocks to be checked * * Return: 0 on success, or %-EIO if an I/O error occurs. */ static int nilfs_compute_checksum(struct the_nilfs *nilfs, struct buffer_head *bhs, u32 *sum, unsigned long offset, u64 check_bytes, sector_t start, unsigned long nblock) { unsigned int blocksize = nilfs->ns_blocksize; unsigned long size; u32 crc; BUG_ON(offset >= blocksize); check_bytes -= offset; size = min_t(u64, check_bytes, blocksize - offset); crc = crc32_le(nilfs->ns_crc_seed, (unsigned char *)bhs->b_data + offset, size); if (--nblock > 0) { do { struct buffer_head *bh; bh = __bread(nilfs->ns_bdev, ++start, blocksize); if (!bh) return -EIO; check_bytes -= size; size = min_t(u64, check_bytes, blocksize); crc = crc32_le(crc, bh->b_data, size); brelse(bh); } while (--nblock > 0); } *sum = crc; return 0; } /** * nilfs_read_super_root_block - read super root block * @nilfs: nilfs object * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Super root block corrupted. * * %-EIO - I/O error. */ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) { struct buffer_head *bh_sr; struct nilfs_super_root *sr; u32 crc; int ret; *pbh = NULL; bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); if (unlikely(!bh_sr)) { ret = NILFS_SEG_FAIL_IO; goto failed; } sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } if (nilfs_compute_checksum( nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, sr_block, 1)) { ret = NILFS_SEG_FAIL_IO; goto failed_bh; } if (crc != le32_to_cpu(sr->sr_sum)) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } } *pbh = bh_sr; return 0; failed_bh: brelse(bh_sr); failed: return nilfs_warn_segment_error(nilfs->ns_sb, ret); } /** * nilfs_read_log_header - read summary header of the specified log * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: pointer to return segment summary structure * * Return: Buffer head pointer, or NULL if an I/O error occurs. */ static struct buffer_head * nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary **sum) { struct buffer_head *bh_sum; bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (bh_sum) *sum = (struct nilfs_segment_summary *)bh_sum->b_data; return bh_sum; } /** * nilfs_validate_log - verify consistency of log * @nilfs: nilfs object * @seg_seq: sequence number of segment * @bh_sum: buffer head of summary block * @sum: segment summary struct * * Return: 0 on success, or one of the following internal codes on failure: * * %NILFS_SEG_FAIL_MAGIC - Magic number mismatch. * * %NILFS_SEG_FAIL_SEQ - Sequence number mismatch. * * %NIFLS_SEG_FAIL_CONSISTENCY - Block count out of range. * * %NILFS_SEG_FAIL_IO - I/O error. * * %NILFS_SEG_FAIL_CHECKSUM_FULL - Full log checksum verification failed. */ static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, struct buffer_head *bh_sum, struct nilfs_segment_summary *sum) { unsigned long nblock; u32 crc; int ret; ret = NILFS_SEG_FAIL_MAGIC; if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) goto out; ret = NILFS_SEG_FAIL_SEQ; if (le64_to_cpu(sum->ss_seq) != seg_seq) goto out; nblock = le32_to_cpu(sum->ss_nblocks); ret = NILFS_SEG_FAIL_CONSISTENCY; if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) /* This limits the number of blocks read in the CRC check */ goto out; ret = NILFS_SEG_FAIL_IO; if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), ((u64)nblock << nilfs->ns_blocksize_bits), bh_sum->b_blocknr, nblock)) goto out; ret = NILFS_SEG_FAIL_CHECKSUM_FULL; if (crc != le32_to_cpu(sum->ss_datasum)) goto out; ret = 0; out: return ret; } /** * nilfs_read_summary_info - read an item on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be read * * Return: Kernel space address of current segment summary entry, or * NULL if an I/O error occurs. */ static void *nilfs_read_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes) { void *ptr; sector_t blocknr; BUG_ON((*pbh)->b_size < *offset); if (bytes > (*pbh)->b_size - *offset) { blocknr = (*pbh)->b_blocknr; brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + 1, nilfs->ns_blocksize); if (unlikely(!*pbh)) return NULL; *offset = 0; } ptr = (*pbh)->b_data + *offset; *offset += bytes; return ptr; } /** * nilfs_skip_summary_info - skip items on summary blocks of a log * @nilfs: nilfs object * @pbh: the current buffer head on summary blocks [in, out] * @offset: the current byte offset on summary blocks [in, out] * @bytes: byte size of the item to be skipped * @count: number of items to be skipped */ static void nilfs_skip_summary_info(struct the_nilfs *nilfs, struct buffer_head **pbh, unsigned int *offset, unsigned int bytes, unsigned long count) { unsigned int rest_item_in_current_block = ((*pbh)->b_size - *offset) / bytes; if (count <= rest_item_in_current_block) { *offset += bytes * count; } else { sector_t blocknr = (*pbh)->b_blocknr; unsigned int nitem_per_block = (*pbh)->b_size / bytes; unsigned int bcnt; count -= rest_item_in_current_block; bcnt = DIV_ROUND_UP(count, nitem_per_block); *offset = bytes * (count - (bcnt - 1) * nitem_per_block); brelse(*pbh); *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, nilfs->ns_blocksize); } } /** * nilfs_scan_dsync_log - get block information of a log written for data sync * @nilfs: nilfs object * @start_blocknr: start block number of the log * @sum: log summary information * @head: list head to add nilfs_recovery_block struct * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. */ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, struct nilfs_segment_summary *sum, struct list_head *head) { struct buffer_head *bh; unsigned int offset; u32 nfinfo, sumbytes; sector_t blocknr; ino_t ino; int err = -EIO; nfinfo = le32_to_cpu(sum->ss_nfinfo); if (!nfinfo) return 0; sumbytes = le32_to_cpu(sum->ss_sumbytes); blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (unlikely(!bh)) goto out; offset = le16_to_cpu(sum->ss_bytes); for (;;) { unsigned long nblocks, ndatablk, nnodeblk; struct nilfs_finfo *finfo; finfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*finfo)); if (unlikely(!finfo)) goto out; ino = le64_to_cpu(finfo->fi_ino); nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); nnodeblk = nblocks - ndatablk; while (ndatablk-- > 0) { struct nilfs_recovery_block *rb; struct nilfs_binfo_v *binfo; binfo = nilfs_read_summary_info(nilfs, &bh, &offset, sizeof(*binfo)); if (unlikely(!binfo)) goto out; rb = kmalloc(sizeof(*rb), GFP_NOFS); if (unlikely(!rb)) { err = -ENOMEM; goto out; } rb->ino = ino; rb->blocknr = blocknr++; rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); rb->blkoff = le64_to_cpu(binfo->bi_blkoff); /* INIT_LIST_HEAD(&rb->list); */ list_add_tail(&rb->list, head); } if (--nfinfo == 0) break; blocknr += nnodeblk; /* always 0 for data sync logs */ nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64), nnodeblk); if (unlikely(!bh)) goto out; } err = 0; out: brelse(bh); /* brelse(NULL) is just ignored */ return err; } static void dispose_recovery_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_recovery_block *rb; rb = list_first_entry(head, struct nilfs_recovery_block, list); list_del(&rb->list); kfree(rb); } } struct nilfs_segment_entry { struct list_head list; __u64 segnum; }; static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) { struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); if (unlikely(!ent)) return -ENOMEM; ent->segnum = segnum; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, head); return 0; } void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { struct nilfs_segment_entry *ent; ent = list_first_entry(head, struct nilfs_segment_entry, list); list_del(&ent->list); kfree(ent); } } static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct list_head *head = &ri->ri_used_segments; struct nilfs_segment_entry *ent, *n; struct inode *sufile = nilfs->ns_sufile; __u64 segnum[4]; int err; int i; segnum[0] = nilfs->ns_segnum; segnum[1] = nilfs->ns_nextnum; segnum[2] = ri->ri_segnum; segnum[3] = ri->ri_nextnum; /* * Releasing the next segment of the latest super root. * The next segment is invalidated by this recovery. */ err = nilfs_sufile_free(sufile, segnum[1]); if (unlikely(err)) { if (err == -ENOENT) { nilfs_err(sb, "checkpoint log inconsistency at block %llu (segment %llu): next segment %llu is unallocated", (unsigned long long)nilfs->ns_last_pseg, (unsigned long long)nilfs->ns_segnum, (unsigned long long)segnum[1]); err = -EINVAL; } goto failed; } for (i = 1; i < 4; i++) { err = nilfs_segment_list_add(head, segnum[i]); if (unlikely(err)) goto failed; } /* * Collecting segments written after the latest super root. * These are marked dirty to avoid being reallocated in the next write. */ list_for_each_entry_safe(ent, n, head, list) { if (ent->segnum != segnum[0]) { err = nilfs_sufile_scrap(sufile, ent->segnum); if (unlikely(err)) goto failed; } list_del(&ent->list); kfree(ent); } /* Allocate new segments for recovery */ err = nilfs_sufile_alloc(sufile, &segnum[0]); if (unlikely(err)) goto failed; nilfs->ns_pseg_offset = 0; nilfs->ns_seg_seq = ri->ri_seq + 2; nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; failed: /* No need to recover sufile because it will be destroyed on error */ return err; } static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, loff_t pos, struct folio *folio) { struct buffer_head *bh_org; size_t from = offset_in_folio(folio, pos); bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; memcpy_to_folio(folio, from, bh_org->b_data, bh_org->b_size); brelse(bh_org); return 0; } static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct list_head *head, unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; unsigned int blocksize = nilfs->ns_blocksize; struct folio *folio; loff_t pos; int err = 0, err2 = 0; list_for_each_entry_safe(rb, n, head, list) { inode = nilfs_iget(sb, root, rb->ino); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; goto failed_inode; } pos = rb->blkoff << inode->i_blkbits; err = block_write_begin(inode->i_mapping, pos, blocksize, &folio, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); goto failed_inode; } err = nilfs_recovery_copy_block(nilfs, rb, pos, folio); if (unlikely(err)) goto failed_folio; err = nilfs_set_file_dirty(inode, 1); if (unlikely(err)) goto failed_folio; block_write_end(NULL, inode->i_mapping, pos, blocksize, blocksize, folio, NULL); folio_unlock(folio); folio_put(folio); (*nr_salvaged_blocks)++; goto next; failed_folio: folio_unlock(folio); folio_put(folio); failed_inode: nilfs_warn(sb, "error %d recovering data block (ino=%lu, block-offset=%llu)", err, (unsigned long)rb->ino, (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: iput(inode); /* iput(NULL) is just ignored */ list_del_init(&rb->list); kfree(rb); } return err2; } /** * nilfs_do_roll_forward - salvage logical segments newer than the latest * checkpoint * @nilfs: nilfs object * @sb: super block instance * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Log format error. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_root *root, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ unsigned long nsalvaged_blocks = 0; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; int empty_seg = 0; int err = 0, ret; LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ enum { RF_INIT_ST, RF_DSYNC_ST, /* scanning data-sync segments */ }; int state = RF_INIT_ST; pseg_start = ri->ri_lsegs_start; seg_seq = ri->ri_lsegs_start_seq; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { brelse(bh_sum); bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) { err = -EIO; goto failed; } ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; goto failed; } goto strayed; } flags = le16_to_cpu(sum->ss_flags); if (flags & NILFS_SS_SR) goto confused; /* Found a valid partial segment; do recovery actions */ nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); empty_seg = 0; nilfs->ns_ctime = le64_to_cpu(sum->ss_create); if (!(flags & NILFS_SS_GC)) nilfs->ns_nongc_ctime = nilfs->ns_ctime; switch (state) { case RF_INIT_ST: if (!(flags & NILFS_SS_LOGBGN) || !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; fallthrough; case RF_DSYNC_ST: if (!(flags & NILFS_SS_SYNDT)) goto confused; err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, &dsync_blocks); if (unlikely(err)) goto failed; if (flags & NILFS_SS_LOGEND) { err = nilfs_recover_dsync_blocks( nilfs, sb, root, &dsync_blocks, &nsalvaged_blocks); if (unlikely(err)) goto failed; state = RF_INIT_ST; } break; /* Fall through to try_next_pseg */ } try_next_pseg: if (pseg_start == ri->ri_lsegs_end) break; pseg_start += le32_to_cpu(sum->ss_nblocks); if (pseg_start < seg_end) continue; goto feed_segment; strayed: if (pseg_start == ri->ri_lsegs_end) break; feed_segment: /* Looking to the next full segment */ if (empty_seg++) break; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } if (nsalvaged_blocks) { nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks); ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: brelse(bh_sum); dispose_recovery_list(&dsync_blocks); return err; confused: err = -EINVAL; failed: nilfs_err(sb, "error %d roll-forwarding partial segment at blocknr = %llu", err, (unsigned long long)pseg_start); goto out; } static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh; int err; if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) return; bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); if (WARN_ON(!bh)) return; /* should never happen */ lock_buffer(bh); memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); set_buffer_dirty(bh); unlock_buffer(bh); err = sync_dirty_buffer(bh); if (unlikely(err)) nilfs_warn(nilfs->ns_sb, "buffer sync write failed during post-cleaning of recovery."); brelse(bh); } /** * nilfs_abort_roll_forward - cleaning up after a failed rollforward recovery * @nilfs: nilfs object */ static void nilfs_abort_roll_forward(struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; LIST_HEAD(head); /* Abandon inodes that have read recovery data */ spin_lock(&nilfs->ns_inode_lock); list_splice_init(&nilfs->ns_dirty_files, &head); spin_unlock(&nilfs->ns_inode_lock); if (list_empty(&head)) return; set_nilfs_purging(nilfs); list_for_each_entry_safe(ii, n, &head, i_dirty) { spin_lock(&nilfs->ns_inode_lock); list_del_init(&ii->i_dirty); spin_unlock(&nilfs->ns_inode_lock); iput(&ii->vfs_inode); } clear_nilfs_purging(nilfs); } /** * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint * @nilfs: nilfs object * @sb: super block instance * @ri: pointer to a nilfs_recovery_info struct to store search results. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - Inconsistent filesystem state. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. * * %-ENOSPC - No space left on device (only in a panic state). * * %-ERESTARTSYS - Interrupted. */ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb, struct nilfs_recovery_info *ri) { struct nilfs_root *root; int err; if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) return 0; err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root); if (unlikely(err)) { nilfs_err(sb, "error %d loading the latest checkpoint", err); return err; } err = nilfs_do_roll_forward(nilfs, sb, root, ri); if (unlikely(err)) goto failed; if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri); if (unlikely(err)) { nilfs_err(sb, "error %d preparing segment for recovery", err); goto failed; } err = nilfs_attach_log_writer(sb, root); if (unlikely(err)) goto failed; set_nilfs_discontinued(nilfs); err = nilfs_construct_segment(sb); nilfs_detach_log_writer(sb); if (unlikely(err)) { nilfs_err(sb, "error %d writing segment for recovery", err); goto put_root; } nilfs_finish_roll_forward(nilfs, ri); } put_root: nilfs_put_root(root); return err; failed: nilfs_abort_roll_forward(nilfs); goto put_root; } /** * nilfs_search_super_root - search the latest valid super root * @nilfs: the_nilfs * @ri: pointer to a nilfs_recovery_info struct to store search results. * * nilfs_search_super_root() looks for the latest super-root from a partial * segment pointed by the superblock. It sets up struct the_nilfs through * this search. It fills nilfs_recovery_info (ri) required for recovery. * * Return: 0 on success, or one of the following negative error codes on * failure: * * %-EINVAL - No valid segment found. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient memory available. */ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { struct buffer_head *bh_sum = NULL; struct nilfs_segment_summary *sum = NULL; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; unsigned long nblocks; unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; LIST_HEAD(segments); int empty_seg = 0, scan_newer = 0; int ret; pseg_start = nilfs->ns_last_pseg; seg_seq = nilfs->ns_last_seq; cno = nilfs->ns_last_cno; segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); /* Calculate range of segment */ nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); /* Read ahead segment */ b = seg_start; while (b <= seg_end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); for (;;) { brelse(bh_sum); ret = NILFS_SEG_FAIL_IO; bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); if (!bh_sum) goto failed; ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; goto strayed; } nblocks = le32_to_cpu(sum->ss_nblocks); pseg_end = pseg_start + nblocks - 1; if (unlikely(pseg_end > seg_end)) { ret = NILFS_SEG_FAIL_CONSISTENCY; goto strayed; } /* A valid partial segment */ ri->ri_pseg_start = pseg_start; ri->ri_seq = seg_seq; ri->ri_segnum = segnum; nextnum = nilfs_get_segnum_of_block(nilfs, le64_to_cpu(sum->ss_next)); ri->ri_nextnum = nextnum; empty_seg = 0; flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { /* * This will never happen because a superblock * (last_segment) always points to a pseg with * a super root. */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } if (pseg_start == seg_start) { nilfs_get_segment_range(nilfs, nextnum, &b, &end); while (b <= end) __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); } if (!(flags & NILFS_SS_SR)) { if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; } if (flags & NILFS_SS_LOGEND) ri->ri_lsegs_end = pseg_start; goto try_next_pseg; } /* A valid super root was found. */ ri->ri_cno = cno++; ri->ri_super_root = pseg_end; ri->ri_lsegs_start = ri->ri_lsegs_end = 0; nilfs_dispose_segment_list(&segments); sr_pseg_start = pseg_start; nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; nilfs->ns_seg_seq = seg_seq; nilfs->ns_segnum = segnum; nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ nilfs->ns_ctime = le64_to_cpu(sum->ss_create); nilfs->ns_nextnum = nextnum; if (scan_newer) ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; else { if (nilfs->ns_mount_state & NILFS_VALID_FS) goto super_root_found; scan_newer = 1; } try_next_pseg: /* Standing on a course, or met an inconsistent state */ pseg_start += nblocks; if (pseg_start < seg_end) continue; goto feed_segment; strayed: /* Off the trail */ if (!scan_newer) /* * This can happen if a checkpoint was written without * barriers, or as a result of an I/O failure. */ goto failed; feed_segment: /* Looking to the next full segment */ if (empty_seg++) goto super_root_found; /* found a valid super root */ ret = nilfs_segment_list_add(&segments, segnum); if (unlikely(ret)) goto failed; seg_seq++; segnum = nextnum; nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); pseg_start = seg_start; } super_root_found: /* Updating pointers relating to the latest checkpoint */ brelse(bh_sum); list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; nilfs->ns_last_cno = ri->ri_cno; return 0; failed: brelse(bh_sum); nilfs_dispose_segment_list(&segments); return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret); } |
1 1 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/module.h> #include <linux/netdevice.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/cdc.h> #include <linux/usb/usbnet.h> #include <linux/usb/r8152.h> #define OCP_BASE 0xe86c static int pla_read_word(struct usbnet *dev, u16 index) { u16 byen = BYTE_EN_WORD; u8 shift = index & 2; __le32 tmp; int ret; if (shift) byen <<= shift; index &= ~3; ret = usbnet_read_cmd(dev, RTL8152_REQ_GET_REGS, RTL8152_REQT_READ, index, MCU_TYPE_PLA | byen, &tmp, sizeof(tmp)); if (ret < 0) goto out; ret = __le32_to_cpu(tmp); ret >>= (shift * 8); ret &= 0xffff; out: return ret; } static int pla_write_word(struct usbnet *dev, u16 index, u32 data) { u32 mask = 0xffff; u16 byen = BYTE_EN_WORD; u8 shift = index & 2; __le32 tmp; int ret; data &= mask; if (shift) { byen <<= shift; mask <<= (shift * 8); data <<= (shift * 8); } index &= ~3; ret = usbnet_read_cmd(dev, RTL8152_REQ_GET_REGS, RTL8152_REQT_READ, index, MCU_TYPE_PLA | byen, &tmp, sizeof(tmp)); if (ret < 0) goto out; data |= __le32_to_cpu(tmp) & ~mask; tmp = __cpu_to_le32(data); ret = usbnet_write_cmd(dev, RTL8152_REQ_SET_REGS, RTL8152_REQT_WRITE, index, MCU_TYPE_PLA | byen, &tmp, sizeof(tmp)); out: return ret; } static int r8153_ecm_mdio_read(struct net_device *netdev, int phy_id, int reg) { struct usbnet *dev = netdev_priv(netdev); int ret; ret = pla_write_word(dev, OCP_BASE, 0xa000); if (ret < 0) goto out; ret = pla_read_word(dev, 0xb400 + reg * 2); out: return ret; } static void r8153_ecm_mdio_write(struct net_device *netdev, int phy_id, int reg, int val) { struct usbnet *dev = netdev_priv(netdev); int ret; ret = pla_write_word(dev, OCP_BASE, 0xa000); if (ret < 0) return; ret = pla_write_word(dev, 0xb400 + reg * 2, val); } static int r8153_bind(struct usbnet *dev, struct usb_interface *intf) { int status; status = usbnet_cdc_bind(dev, intf); if (status < 0) return status; dev->mii.dev = dev->net; dev->mii.mdio_read = r8153_ecm_mdio_read; dev->mii.mdio_write = r8153_ecm_mdio_write; dev->mii.reg_num_mask = 0x1f; dev->mii.supports_gmii = 1; return status; } static const struct driver_info r8153_info = { .description = "RTL8153 ECM Device", .flags = FLAG_ETHER, .bind = r8153_bind, .unbind = usbnet_cdc_unbind, .status = usbnet_cdc_status, .manage_power = usbnet_manage_power, }; static const struct usb_device_id products[] = { /* Realtek RTL8153 Based USB 3.0 Ethernet Adapters */ { USB_DEVICE_AND_INTERFACE_INFO(VENDOR_ID_REALTEK, 0x8153, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&r8153_info, }, /* Lenovo Powered USB-C Travel Hub (4X90S92381, based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(VENDOR_ID_LENOVO, 0x721e, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&r8153_info, }, /* Lenovo ThinkPad Hybrid USB-C with USB-A Dock (40af0135eu, based on Realtek RTL8153) */ { USB_DEVICE_AND_INTERFACE_INFO(VENDOR_ID_LENOVO, 0xa359, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&r8153_info, }, { }, /* END */ }; MODULE_DEVICE_TABLE(usb, products); static int rtl8153_ecm_probe(struct usb_interface *intf, const struct usb_device_id *id) { #if IS_REACHABLE(CONFIG_USB_RTL8152) if (rtl8152_get_version(intf)) return -ENODEV; #endif return usbnet_probe(intf, id); } static struct usb_driver r8153_ecm_driver = { .name = "r8153_ecm", .id_table = products, .probe = rtl8153_ecm_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .reset_resume = usbnet_resume, .supports_autosuspend = 1, .disable_hub_initiated_lpm = 1, }; module_usb_driver(r8153_ecm_driver); MODULE_AUTHOR("Hayes Wang"); MODULE_DESCRIPTION("Realtek USB ECM device"); MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 7 7 6 5 1 5 5 5 5 5 5 5 5 5 1 4 5 5 5 1 4 5 1 5 5 5 13 13 2 1 13 13 12 6 6 5 5 5 5 1 1 1 1 3 3 2 2 2 2 2 1 1 1 1 1 3 1 1 1 1 2 1 1 1 1 1 1 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPV4 GSO/GRO offload support * Linux INET implementation * * UDPv4 GSO support */ #include <linux/skbuff.h> #include <net/gro.h> #include <net/gso.h> #include <net/udp.h> #include <net/protocol.h> #include <net/inet_common.h> static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, netdev_features_t features), __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); bool remcsum, need_csum, offload_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int udp_offset, outer_hlen; __wsum partial; bool need_ipsec; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; /* Adjust partial header checksum to negate old length. * We cannot rely on the value contained in uh->len as it is * possible that the actual value exceeds the boundaries of the * 16 bit length field due to the header being added outside of an * IP or IPv6 frame that was already limited to 64K - 1. */ if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) partial = (__force __wsum)uh->len; else partial = (__force __wsum)htonl(skb->len); partial = csum_sub(csum_unfold(uh->check), partial); /* setup inner skb. */ skb->encapsulation = 0; SKB_GSO_CB(skb)->encap_level = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb_set_transport_header(skb, skb_inner_transport_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); skb->encap_hdr_csum = need_csum; remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && !need_ipsec && (skb->dev->features & (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); features &= skb->dev->hw_enc_features; if (need_csum) features &= ~NETIF_F_SCTP_CRC; /* The only checksum offload we care about from here on out is the * outer one so strip the existing checksum feature flags and * instead set the flag based on our outer checksum offload value. */ if (remcsum) { features &= ~NETIF_F_CSUM_MASK; if (!need_csum || offload_csum) features |= NETIF_F_HW_CSUM; } /* segment inner packet. */ segs = gso_inner_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); goto out; } gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); outer_hlen = skb_tnl_header_len(skb); udp_offset = outer_hlen - tnl_hlen; skb = segs; do { unsigned int len; if (remcsum) skb->ip_summed = CHECKSUM_NONE; /* Set up inner headers if we are offloading inner checksum */ if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } skb->mac_len = mac_len; skb->protocol = protocol; __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); len = skb->len - udp_offset; uh = udp_hdr(skb); /* If we are only performing partial GSO the inner header * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ if (gso_partial && skb_is_gso(skb)) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); } else { uh->len = htons(len); } if (!need_csum) continue; uh->check = ~csum_fold(csum_add(partial, (__force __wsum)htonl(len))); if (skb->encapsulation || !offload_csum) { uh->check = gso_make_checksum(skb, ~uh->check); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); } } while ((skb = skb->next)); out: return segs; } struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) { const struct net_offload __rcu **offloads; __be16 protocol = skb->protocol; const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, netdev_features_t features); rcu_read_lock(); switch (skb->inner_protocol_type) { case ENCAP_TYPE_ETHER: protocol = skb->inner_protocol; gso_inner_segment = skb_mac_gso_segment; break; case ENCAP_TYPE_IPPROTO: offloads = is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[skb->inner_ipproto]); if (!ops || !ops->callbacks.gso_segment) goto out_unlock; gso_inner_segment = ops->callbacks.gso_segment; break; default: goto out_unlock; } segs = __skb_udp_tunnel_segment(skb, features, gso_inner_segment, protocol, is_ipv6); out_unlock: rcu_read_unlock(); return segs; } EXPORT_SYMBOL(skb_udp_tunnel_segment); static void __udpv4_gso_segment_csum(struct sk_buff *seg, __be32 *oldip, __be32 *newip, __be16 *oldport, __be16 *newport) { struct udphdr *uh; struct iphdr *iph; if (*oldip == *newip && *oldport == *newport) return; uh = udp_hdr(seg); iph = ip_hdr(seg); if (uh->check) { inet_proto_csum_replace4(&uh->check, seg, *oldip, *newip, true); inet_proto_csum_replace2(&uh->check, seg, *oldport, *newport, false); if (!uh->check) uh->check = CSUM_MANGLED_0; } *oldport = *newport; csum_replace4(&iph->check, *oldip, *newip); *oldip = *newip; } static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs) { struct sk_buff *seg; struct udphdr *uh, *uh2; struct iphdr *iph, *iph2; seg = segs; uh = udp_hdr(seg); iph = ip_hdr(seg); if ((udp_hdr(seg)->dest == udp_hdr(seg->next)->dest) && (udp_hdr(seg)->source == udp_hdr(seg->next)->source) && (ip_hdr(seg)->daddr == ip_hdr(seg->next)->daddr) && (ip_hdr(seg)->saddr == ip_hdr(seg->next)->saddr)) return segs; while ((seg = seg->next)) { uh2 = udp_hdr(seg); iph2 = ip_hdr(seg); __udpv4_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, &uh2->source, &uh->source); __udpv4_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, &uh2->dest, &uh->dest); } return segs; } static void __udpv6_gso_segment_csum(struct sk_buff *seg, struct in6_addr *oldip, const struct in6_addr *newip, __be16 *oldport, __be16 newport) { struct udphdr *uh = udp_hdr(seg); if (ipv6_addr_equal(oldip, newip) && *oldport == newport) return; if (uh->check) { inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32, newip->s6_addr32, true); inet_proto_csum_replace2(&uh->check, seg, *oldport, newport, false); if (!uh->check) uh->check = CSUM_MANGLED_0; } *oldip = *newip; *oldport = newport; } static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs) { const struct ipv6hdr *iph; const struct udphdr *uh; struct ipv6hdr *iph2; struct sk_buff *seg; struct udphdr *uh2; seg = segs; uh = udp_hdr(seg); iph = ipv6_hdr(seg); uh2 = udp_hdr(seg->next); iph2 = ipv6_hdr(seg->next); if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) && ipv6_addr_equal(&iph->saddr, &iph2->saddr) && ipv6_addr_equal(&iph->daddr, &iph2->daddr)) return segs; while ((seg = seg->next)) { uh2 = udp_hdr(seg); iph2 = ipv6_hdr(seg); __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, &uh2->source, uh->source); __udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, &uh2->dest, uh->dest); } return segs; } static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) { unsigned int mss = skb_shinfo(skb)->gso_size; skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); if (IS_ERR(skb)) return skb; udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); if (is_ipv6) return __udpv6_gso_segment_list_csum(skb); else return __udpv4_gso_segment_list_csum(skb); } struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6) { struct sock *sk = gso_skb->sk; unsigned int sum_truesize = 0; struct sk_buff *segs, *seg; struct udphdr *uh; unsigned int mss; bool copy_dtor; __sum16 check; __be16 newlen; mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); if (unlikely(skb_checksum_start(gso_skb) != skb_transport_header(gso_skb) && !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST))) return ERR_PTR(-EINVAL); /* We don't know if egress device can segment and checksum the packet * when IPv6 extension headers are present. Fall back to software GSO. */ if (gso_skb->ip_summed != CHECKSUM_PARTIAL) features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK); if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh), mss); return NULL; } if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) { /* Detect modified geometry and pass those to skb_segment. */ if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) return __udp_gso_segment_list(gso_skb, features, is_ipv6); /* Setup csum, as fraglist skips this in udp4_gro_receive. */ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; gso_skb->csum_offset = offsetof(struct udphdr, check); gso_skb->ip_summed = CHECKSUM_PARTIAL; uh = udp_hdr(gso_skb); if (is_ipv6) uh->check = ~udp_v6_check(gso_skb->len, &ipv6_hdr(gso_skb)->saddr, &ipv6_hdr(gso_skb)->daddr, 0); else uh->check = ~udp_v4_check(gso_skb->len, ip_hdr(gso_skb)->saddr, ip_hdr(gso_skb)->daddr, 0); } skb_pull(gso_skb, sizeof(*uh)); /* clear destructor to avoid skb_segment assigning it to tail */ copy_dtor = gso_skb->destructor == sock_wfree; if (copy_dtor) { gso_skb->destructor = NULL; gso_skb->sk = NULL; } segs = skb_segment(gso_skb, features); if (IS_ERR_OR_NULL(segs)) { if (copy_dtor) { gso_skb->destructor = sock_wfree; gso_skb->sk = sk; } return segs; } /* GSO partial and frag_list segmentation only requires splitting * the frame into an MSS multiple and possibly a remainder, both * cases return a GSO skb. So update the mss now. */ if (skb_is_gso(segs)) mss *= skb_shinfo(segs)->gso_segs; seg = segs; uh = udp_hdr(seg); /* preserve TX timestamp flags and TS key for first segment */ skb_shinfo(seg)->tskey = skb_shinfo(gso_skb)->tskey; skb_shinfo(seg)->tx_flags |= (skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP); /* compute checksum adjustment based on old length versus new */ newlen = htons(sizeof(*uh) + mss); check = csum16_add(csum16_sub(uh->check, uh->len), newlen); for (;;) { if (copy_dtor) { seg->destructor = sock_wfree; seg->sk = sk; sum_truesize += seg->truesize; } if (!seg->next) break; uh->len = newlen; uh->check = check; if (seg->ip_summed == CHECKSUM_PARTIAL) gso_reset_checksum(seg, ~check); else uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0; seg = seg->next; uh = udp_hdr(seg); } /* last packet can be partial gso_size, account for that in checksum */ newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) + seg->data_len); check = csum16_add(csum16_sub(uh->check, uh->len), newlen); uh->len = newlen; uh->check = check; if (seg->ip_summed == CHECKSUM_PARTIAL) gso_reset_checksum(seg, ~check); else uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0; /* On the TX path, CHECKSUM_NONE and CHECKSUM_UNNECESSARY have the same * meaning. However, check for bad offloads in the GSO stack expects the * latter, if the checksum was calculated in software. To vouch for the * segment skbs we actually need to set it on the gso_skb. */ if (gso_skb->ip_summed == CHECKSUM_NONE) gso_skb->ip_summed = CHECKSUM_UNNECESSARY; /* update refcount for the packet */ if (copy_dtor) { int delta = sum_truesize - gso_skb->truesize; /* In some pathological cases, delta can be negative. * We need to either use refcount_add() or refcount_sub_and_test() */ if (likely(delta >= 0)) refcount_add(delta, &sk->sk_wmem_alloc); else WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); } return segs; } EXPORT_SYMBOL_GPL(__udp_gso_segment); static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; __wsum csum; struct udphdr *uh; struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features, false); goto out; } if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4))) goto out; if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) return __udp_gso_segment(skb, features, false); mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; /* Do software UFO. Complete and fill in the UDP checksum as * HW cannot do checksum of UDP packets sent as multiple * IP fragments. */ uh = udp_hdr(skb); iph = ip_hdr(skb); uh->check = 0; csum = skb_checksum(skb, 0, skb->len, 0); uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in * software prior to segmenting the frame. */ if (!skb->encap_hdr_csum) features |= NETIF_F_HW_CSUM; /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ segs = skb_segment(skb, features); out: return segs; } #define UDP_GRO_CNT_MAX 64 static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; unsigned int ulen; int ret = 0; int flush; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } /* Do not deal with padded or malicious packets, sorry ! */ ulen = ntohs(uh->len); if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } /* pull encapsulating udp header */ skb_gro_pull(skb, sizeof(struct udphdr)); list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; uh2 = udp_hdr(p); /* Match ports only, as csum is always non zero */ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) { NAPI_GRO_CB(skb)->flush = 1; return p; } flush = gro_receive_network_flush(uh, uh2, p); /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ if (ulen > ntohs(uh2->len) || flush) { pp = p; } else { if (NAPI_GRO_CB(skb)->is_flist) { if (!pskb_may_pull(skb, skb_gro_offset(skb))) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } if ((skb->ip_summed != p->ip_summed) || (skb->csum_level != p->csum_level)) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } ret = skb_gro_receive_list(p, skb); } else { skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); ret = skb_gro_receive(p, skb); } } if (ret || ulen != ntohs(uh2->len) || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; return pp; } /* mismatch, but we never need to flush */ return NULL; } struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk) { struct sk_buff *pp = NULL; struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; /* We can do L4 aggregation only if the packet can't land in a tunnel * otherwise we could corrupt the inner stream. Detecting such packets * cannot be foolproof and the aggregation might still happen in some * cases. Such packets should be caught in udp_unexpected_gso later. */ NAPI_GRO_CB(skb)->is_flist = 0; if (!sk || !udp_sk(sk)->gro_receive) { /* If the packet was locally encapsulated in a UDP tunnel that * wasn't detected above, do not GRO. */ if (skb->encapsulation) goto out; if (skb->dev->features & NETIF_F_GRO_FRAGLIST) NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist) return call_gro_receive(udp_gro_receive_segment, head, skb); /* no GRO, be sure flush the current packet */ goto out; } if (NAPI_GRO_CB(skb)->encap_mark || (uh->check && skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; flush = 0; list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; uh2 = (struct udphdr *)(p->data + off); /* Match ports and either checksums are either both zero * or nonzero. */ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) || (!uh->check ^ !uh2->check)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport, iif, sdif, net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); struct sock *sk = NULL; struct sk_buff *pp; if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ if (NAPI_GRO_CB(skb)->flush) goto skip; if (skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check, inet_gro_compute_pseudo)) goto flush; else if (uh->check) skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; if (static_branch_unlikely(&udp_encap_needed_key)) sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); pp = udp_gro_receive(head, skb, uh, sk); return pp; flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; } static int udp_gro_complete_segment(struct sk_buff *skb) { struct udphdr *uh = udp_hdr(skb); skb->csum_start = (unsigned char *)uh - skb->head; skb->csum_offset = offsetof(struct udphdr, check); skb->ip_summed = CHECKSUM_PARTIAL; skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4; if (skb->encapsulation) skb->inner_transport_header = skb->transport_header; return 0; } int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup) { __be16 newlen = htons(skb->len - nhoff); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); struct sock *sk; int err; uh->len = newlen; sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, udp4_lib_lookup_skb, skb, uh->source, uh->dest); if (sk && udp_sk(sk)->gro_complete) { skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; /* clear the encap mark, so that inner frag_list gro_complete * can take place */ NAPI_GRO_CB(skb)->encap_mark = 0; /* Set encapsulation before calling into inner gro_complete() * functions to make them set up the inner offsets. */ skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); } else { err = udp_gro_complete_segment(skb); } if (skb->remcsum_offload) skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM; return err; } EXPORT_SYMBOL(udp_gro_complete); INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); /* do fraglist only if there is no outer UDP encap (or we already processed it) */ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) { uh->len = htons(skb->len - nhoff); skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; __skb_incr_checksum_unnecessary(skb); return 0; } if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); } int __init udpv4_offload_init(void) { net_hotdata.udpv4_offload = (struct net_offload) { .callbacks = { .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, }; return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } |
41 40 41 27 40 41 41 36 39 13 39 39 41 40 139 140 140 140 35 35 139 145 145 145 140 26 27 41 145 145 144 145 143 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 | /* SPDX-License-Identifier: GPL-2.0-only * * Generic bitmap / 8 bpp image bitstreamer for packed pixel framebuffers * * Rewritten by: * Copyright (C) 2025 Zsolt Kajtar (soci@c64.rulez.org) * * Based on previous work of: * Copyright (C) June 1999 James Simmons * Anton Vorontsov <avorontsov@ru.mvista.com> * Pavel Pisa <pisa@cmp.felk.cvut.cz> * Antonino A. Daplas <adaplas@gmail.com> * and others * * NOTES: * * Handles native and foreign byte order on both endians, standard and * reverse pixel order in a byte (<8 BPP), word length of 32/64 bits, * bits per pixel from 1 to the word length. Handles line lengths at byte * granularity while maintaining aligned accesses. * * Optimized routines for word aligned 1, 2, 4 pixel per word for high * bpp modes and 4 pixel at a time operation for low bpp. * * The color image is expected to be one byte per pixel, and values should * not exceed the bitdepth or the pseudo palette (if used). */ #include "fb_draw.h" /* bitmap image iterator, one pixel at a time */ struct fb_bitmap_iter { const u8 *data; unsigned long colors[2]; int width, i; }; static bool fb_bitmap_image(void *iterator, unsigned long *pixels, int *bits) { struct fb_bitmap_iter *iter = iterator; if (iter->i < iter->width) { int bit = ~iter->i & (BITS_PER_BYTE-1); int byte = iter->i++ / BITS_PER_BYTE; *pixels = iter->colors[(iter->data[byte] >> bit) & 1]; return true; } iter->data += BITS_TO_BYTES(iter->width); iter->i = 0; return false; } /* color image iterator, one pixel at a time */ struct fb_color_iter { const u8 *data; const u32 *palette; struct fb_reverse reverse; int shift; int width, i; }; static bool fb_color_image(void *iterator, unsigned long *pixels, int *bits) { struct fb_color_iter *iter = iterator; if (iter->i < iter->width) { unsigned long color = iter->data[iter->i++]; if (iter->palette) color = iter->palette[color]; *pixels = color << iter->shift; if (iter->reverse.pixel) *pixels = fb_reverse_bits_long(*pixels); return true; } iter->data += iter->width; iter->i = 0; return false; } /* bitmap image iterator, 4 pixels at a time */ struct fb_bitmap4x_iter { const u8 *data; u32 fgxcolor, bgcolor; int width, i; const u32 *expand; int bpp; bool top; }; static bool fb_bitmap4x_image(void *iterator, unsigned long *pixels, int *bits) { struct fb_bitmap4x_iter *iter = iterator; u8 data; if (iter->i >= BITS_PER_BYTE/2) { iter->i -= BITS_PER_BYTE/2; iter->top = !iter->top; if (iter->top) data = *iter->data++ >> BITS_PER_BYTE/2; else data = iter->data[-1] & ((1 << BITS_PER_BYTE/2)-1); } else if (iter->i != 0) { *bits = iter->bpp * iter->i; if (iter->top) data = iter->data[-1] & ((1 << BITS_PER_BYTE/2)-1); else data = *iter->data++ >> BITS_PER_BYTE/2; #ifndef __LITTLE_ENDIAN data >>= BITS_PER_BYTE/2 - iter->i; #endif iter->i = 0; } else { *bits = iter->bpp * BITS_PER_BYTE/2; iter->i = iter->width; iter->top = false; return false; } *pixels = (iter->fgxcolor & iter->expand[data]) ^ iter->bgcolor; #ifndef __LITTLE_ENDIAN *pixels <<= BITS_PER_LONG - *bits; #endif return true; } /* draw a line a group of pixels at a time */ static __always_inline void fb_bitblit(bool (*get)(void *iter, unsigned long *pixels, int *bits), void *iter, int bits, struct fb_address *dst, struct fb_reverse reverse) { unsigned long pixels, val, mask, old; int offset = 0; int shift = dst->bits; if (shift) { old = fb_read_offset(0, dst); val = fb_reverse_long(old, reverse); val &= ~fb_right(~0UL, shift); } else { old = 0; val = 0; } while (get(iter, &pixels, &bits)) { val |= fb_right(pixels, shift); shift += bits; if (shift < BITS_PER_LONG) continue; val = fb_reverse_long(val, reverse); fb_write_offset(val, offset++, dst); shift &= BITS_PER_LONG - 1; val = !shift ? 0 : fb_left(pixels, bits - shift); } if (shift) { mask = ~fb_pixel_mask(shift, reverse); val = fb_reverse_long(val, reverse); if (offset || !dst->bits) old = fb_read_offset(offset, dst); fb_write_offset(fb_comp(val, old, mask), offset, dst); } } /* draw a color image a pixel at a time */ static inline void fb_color_imageblit(const struct fb_image *image, struct fb_address *dst, unsigned int bits_per_line, const u32 *palette, int bpp, struct fb_reverse reverse) { struct fb_color_iter iter; u32 height; iter.data = (const u8 *)image->data; iter.palette = palette; iter.reverse = reverse; #ifdef __LITTLE_ENDIAN if (reverse.pixel) iter.shift = BITS_PER_BYTE - bpp; else iter.shift = 0; #else if (reverse.pixel) iter.shift = BITS_PER_LONG - BITS_PER_BYTE; else iter.shift = BITS_PER_LONG - bpp; #endif iter.width = image->width; iter.i = 0; height = image->height; while (height--) { fb_bitblit(fb_color_image, &iter, bpp, dst, reverse); fb_address_forward(dst, bits_per_line); } } #ifdef __LITTLE_ENDIAN #define FB_GEN(a, b) (((a)/8+(((a)&4)<<((b)-2)) \ +(((a)&2)<<((b)*2-1))+(((a)&1u)<<((b)*3)))*((1<<(b))-1)) #define FB_GEN1(a) ((a)/8+((a)&4)/2+((a)&2)*2+((a)&1)*8) #else #define FB_GEN(a, b) (((((a)/8)<<((b)*3))+(((a)&4)<<((b)*2-2)) \ +(((a)&2)<<(b-1))+((a)&1u))*((1<<(b))-1)) #define FB_GEN1(a) (a) #endif #define FB_GENx(a) { FB_GEN(0, (a)), FB_GEN(1, (a)), FB_GEN(2, (a)), FB_GEN(3, (a)), \ FB_GEN(4, (a)), FB_GEN(5, (a)), FB_GEN(6, (a)), FB_GEN(7, (a)), \ FB_GEN(8, (a)), FB_GEN(9, (a)), FB_GEN(10, (a)), FB_GEN(11, (a)), \ FB_GEN(12, (a)), FB_GEN(13, (a)), FB_GEN(14, (a)), FB_GEN(15, (a)) } /* draw a 2 color image four pixels at a time (for 1-8 bpp only) */ static inline void fb_bitmap4x_imageblit(const struct fb_image *image, struct fb_address *dst, unsigned long fgcolor, unsigned long bgcolor, int bpp, unsigned int bits_per_line, struct fb_reverse reverse) { static const u32 mul[BITS_PER_BYTE] = { 0xf, 0x55, 0x249, 0x1111, 0x8421, 0x41041, 0x204081, 0x01010101 }; static const u32 expand[BITS_PER_BYTE][1 << 4] = { { FB_GEN1(0), FB_GEN1(1), FB_GEN1(2), FB_GEN1(3), FB_GEN1(4), FB_GEN1(5), FB_GEN1(6), FB_GEN1(7), FB_GEN1(8), FB_GEN1(9), FB_GEN1(10), FB_GEN1(11), FB_GEN1(12), FB_GEN1(13), FB_GEN1(14), FB_GEN1(15) }, FB_GENx(2), FB_GENx(3), FB_GENx(4), FB_GENx(5), FB_GENx(6), FB_GENx(7), FB_GENx(8), }; struct fb_bitmap4x_iter iter; u32 height; iter.data = (const u8 *)image->data; if (reverse.pixel) { fgcolor = fb_reverse_bits_long(fgcolor << (BITS_PER_BYTE - bpp)); bgcolor = fb_reverse_bits_long(bgcolor << (BITS_PER_BYTE - bpp)); } iter.fgxcolor = (fgcolor ^ bgcolor) * mul[bpp-1]; iter.bgcolor = bgcolor * mul[bpp-1]; iter.width = image->width; iter.i = image->width; iter.expand = expand[bpp-1]; iter.bpp = bpp; iter.top = false; height = image->height; while (height--) { fb_bitblit(fb_bitmap4x_image, &iter, bpp * BITS_PER_BYTE/2, dst, reverse); fb_address_forward(dst, bits_per_line); } } /* draw a bitmap image 1 pixel at a time (for >8 bpp) */ static inline void fb_bitmap1x_imageblit(const struct fb_image *image, struct fb_address *dst, unsigned long fgcolor, unsigned long bgcolor, int bpp, unsigned int bits_per_line, struct fb_reverse reverse) { struct fb_bitmap_iter iter; u32 height; iter.colors[0] = bgcolor; iter.colors[1] = fgcolor; #ifndef __LITTLE_ENDIAN iter.colors[0] <<= BITS_PER_LONG - bpp; iter.colors[1] <<= BITS_PER_LONG - bpp; #endif iter.data = (const u8 *)image->data; iter.width = image->width; iter.i = 0; height = image->height; while (height--) { fb_bitblit(fb_bitmap_image, &iter, bpp, dst, reverse); fb_address_forward(dst, bits_per_line); } } /* one pixel per word, 64/32 bpp blitting */ static inline void fb_bitmap_1ppw(const struct fb_image *image, struct fb_address *dst, unsigned long fgcolor, unsigned long bgcolor, int words_per_line, struct fb_reverse reverse) { unsigned long tab[2]; const u8 *src = (u8 *)image->data; int width = image->width; int offset; u32 height; if (reverse.byte) { tab[0] = swab_long(bgcolor); tab[1] = swab_long(fgcolor); } else { tab[0] = bgcolor; tab[1] = fgcolor; } height = image->height; while (height--) { for (offset = 0; offset + 8 <= width; offset += 8) { unsigned int srcbyte = *src++; fb_write_offset(tab[(srcbyte >> 7) & 1], offset + 0, dst); fb_write_offset(tab[(srcbyte >> 6) & 1], offset + 1, dst); fb_write_offset(tab[(srcbyte >> 5) & 1], offset + 2, dst); fb_write_offset(tab[(srcbyte >> 4) & 1], offset + 3, dst); fb_write_offset(tab[(srcbyte >> 3) & 1], offset + 4, dst); fb_write_offset(tab[(srcbyte >> 2) & 1], offset + 5, dst); fb_write_offset(tab[(srcbyte >> 1) & 1], offset + 6, dst); fb_write_offset(tab[(srcbyte >> 0) & 1], offset + 7, dst); } if (offset < width) { unsigned int srcbyte = *src++; while (offset < width) { fb_write_offset(tab[(srcbyte >> 7) & 1], offset, dst); srcbyte <<= 1; offset++; } } fb_address_move_long(dst, words_per_line); } } static inline unsigned long fb_pack(unsigned long left, unsigned long right, int bits) { #ifdef __LITTLE_ENDIAN return left | right << bits; #else return right | left << bits; #endif } /* aligned 32/16 bpp blitting */ static inline void fb_bitmap_2ppw(const struct fb_image *image, struct fb_address *dst, unsigned long fgcolor, unsigned long bgcolor, int words_per_line, struct fb_reverse reverse) { unsigned long tab[4]; const u8 *src = (u8 *)image->data; int width = image->width / 2; int offset; u32 height; tab[0] = fb_pack(bgcolor, bgcolor, BITS_PER_LONG/2); tab[1] = fb_pack(bgcolor, fgcolor, BITS_PER_LONG/2); tab[2] = fb_pack(fgcolor, bgcolor, BITS_PER_LONG/2); tab[3] = fb_pack(fgcolor, fgcolor, BITS_PER_LONG/2); if (reverse.byte) { tab[0] = swab_long(tab[0]); tab[1] = swab_long(tab[1]); tab[2] = swab_long(tab[2]); tab[3] = swab_long(tab[3]); } height = image->height; while (height--) { for (offset = 0; offset + 4 <= width; offset += 4) { unsigned int srcbyte = *src++; fb_write_offset(tab[(srcbyte >> 6) & 3], offset + 0, dst); fb_write_offset(tab[(srcbyte >> 4) & 3], offset + 1, dst); fb_write_offset(tab[(srcbyte >> 2) & 3], offset + 2, dst); fb_write_offset(tab[(srcbyte >> 0) & 3], offset + 3, dst); } if (offset < width) { unsigned int srcbyte = *src++; while (offset < width) { fb_write_offset(tab[(srcbyte >> 6) & 3], offset, dst); srcbyte <<= 2; offset++; } } fb_address_move_long(dst, words_per_line); } } #define FB_PATP(a, b) (((a)<<((b)*BITS_PER_LONG/4))*((1UL<<BITS_PER_LONG/4)-1UL)) #define FB_PAT4(a) (FB_PATP((a)&1, 0)|FB_PATP(((a)&2)/2, 1)| \ FB_PATP(((a)&4)/4, 2)|FB_PATP(((a)&8)/8, 3)) /* aligned 16/8 bpp blitting */ static inline void fb_bitmap_4ppw(const struct fb_image *image, struct fb_address *dst, unsigned long fgcolor, unsigned long bgcolor, int words_per_line, struct fb_reverse reverse) { static const unsigned long tab16_be[] = { 0, FB_PAT4(1UL), FB_PAT4(2UL), FB_PAT4(3UL), FB_PAT4(4UL), FB_PAT4(5UL), FB_PAT4(6UL), FB_PAT4(7UL), FB_PAT4(8UL), FB_PAT4(9UL), FB_PAT4(10UL), FB_PAT4(11UL), FB_PAT4(12UL), FB_PAT4(13UL), FB_PAT4(14UL), ~0UL }; static const unsigned long tab16_le[] = { 0, FB_PAT4(8UL), FB_PAT4(4UL), FB_PAT4(12UL), FB_PAT4(2UL), FB_PAT4(10UL), FB_PAT4(6UL), FB_PAT4(14UL), FB_PAT4(1UL), FB_PAT4(9UL), FB_PAT4(5UL), FB_PAT4(13UL), FB_PAT4(3UL), FB_PAT4(11UL), FB_PAT4(7UL), ~0UL }; const unsigned long *tab; const u8 *src = (u8 *)image->data; int width = image->width / 4; int offset; u32 height; fgcolor = fgcolor | fgcolor << BITS_PER_LONG/4; bgcolor = bgcolor | bgcolor << BITS_PER_LONG/4; fgcolor = fgcolor | fgcolor << BITS_PER_LONG/2; bgcolor = bgcolor | bgcolor << BITS_PER_LONG/2; fgcolor ^= bgcolor; if (BITS_PER_LONG/4 > BITS_PER_BYTE && reverse.byte) { fgcolor = swab_long(fgcolor); bgcolor = swab_long(bgcolor); } #ifdef __LITTLE_ENDIAN tab = reverse.byte ? tab16_be : tab16_le; #else tab = reverse.byte ? tab16_le : tab16_be; #endif height = image->height; while (height--) { for (offset = 0; offset + 2 <= width; offset += 2, src++) { fb_write_offset((fgcolor & tab[*src >> 4]) ^ bgcolor, offset + 0, dst); fb_write_offset((fgcolor & tab[*src & 0xf]) ^ bgcolor, offset + 1, dst); } if (offset < width) fb_write_offset((fgcolor & tab[*src++ >> 4]) ^ bgcolor, offset, dst); fb_address_move_long(dst, words_per_line); } } static inline void fb_bitmap_imageblit(const struct fb_image *image, struct fb_address *dst, unsigned int bits_per_line, const u32 *palette, int bpp, struct fb_reverse reverse) { unsigned long fgcolor, bgcolor; if (palette) { fgcolor = palette[image->fg_color]; bgcolor = palette[image->bg_color]; } else { fgcolor = image->fg_color; bgcolor = image->bg_color; } if (!dst->bits && !(bits_per_line & (BITS_PER_LONG-1))) { if (bpp == BITS_PER_LONG && BITS_PER_LONG == 32) { fb_bitmap_1ppw(image, dst, fgcolor, bgcolor, bits_per_line / BITS_PER_LONG, reverse); return; } if (bpp == BITS_PER_LONG/2 && !(image->width & 1)) { fb_bitmap_2ppw(image, dst, fgcolor, bgcolor, bits_per_line / BITS_PER_LONG, reverse); return; } if (bpp == BITS_PER_LONG/4 && !(image->width & 3)) { fb_bitmap_4ppw(image, dst, fgcolor, bgcolor, bits_per_line / BITS_PER_LONG, reverse); return; } } if (bpp > 0 && bpp <= BITS_PER_BYTE) fb_bitmap4x_imageblit(image, dst, fgcolor, bgcolor, bpp, bits_per_line, reverse); else if (bpp > BITS_PER_BYTE && bpp <= BITS_PER_LONG) fb_bitmap1x_imageblit(image, dst, fgcolor, bgcolor, bpp, bits_per_line, reverse); } static inline void fb_imageblit(struct fb_info *p, const struct fb_image *image) { int bpp = p->var.bits_per_pixel; unsigned int bits_per_line = BYTES_TO_BITS(p->fix.line_length); struct fb_address dst = fb_address_init(p); struct fb_reverse reverse = fb_reverse_init(p); const u32 *palette = fb_palette(p); fb_address_forward(&dst, image->dy * bits_per_line + image->dx * bpp); if (image->depth == 1) fb_bitmap_imageblit(image, &dst, bits_per_line, palette, bpp, reverse); else fb_color_imageblit(image, &dst, bits_per_line, palette, bpp, reverse); } |
2 2 2 2 4 2 2 2 4 3 3 3 1 3 3 2 2 2 2 2 2 2 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 6 5 5 5 2 2 5 5 4 5 5 5 4 4 4 4 3 3 1 5 4 2 2 2 2 2 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * Copyright (c) Nokia Corporation, 2007 * * Author: Artem Bityutskiy (Битюцкий Артём), * Frank Haverkamp */ /* * This file includes UBI initialization and building of UBI devices. * * When UBI is initialized, it attaches all the MTD devices specified as the * module load parameters or the kernel boot parameters. If MTD devices were * specified, UBI does not attach any MTD device, but it is possible to do * later using the "UBI control device". */ #include <linux/err.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/stringify.h> #include <linux/namei.h> #include <linux/stat.h> #include <linux/miscdevice.h> #include <linux/mtd/partitions.h> #include <linux/log2.h> #include <linux/kthread.h> #include <linux/kernel.h> #include <linux/of.h> #include <linux/slab.h> #include <linux/major.h> #include "ubi.h" /* Maximum length of the 'mtd=' parameter */ #define MTD_PARAM_LEN_MAX 64 /* Maximum number of comma-separated items in the 'mtd=' parameter */ #define MTD_PARAM_MAX_COUNT 6 /* Maximum value for the number of bad PEBs per 1024 PEBs */ #define MAX_MTD_UBI_BEB_LIMIT 768 #ifdef CONFIG_MTD_UBI_MODULE #define ubi_is_module() 1 #else #define ubi_is_module() 0 #endif /** * struct mtd_dev_param - MTD device parameter description data structure. * @name: MTD character device node path, MTD device name, or MTD device number * string * @ubi_num: UBI number * @vid_hdr_offs: VID header offset * @max_beb_per1024: maximum expected number of bad PEBs per 1024 PEBs * @enable_fm: enable fastmap when value is non-zero * @need_resv_pool: reserve pool->max_size pebs when value is none-zero */ struct mtd_dev_param { char name[MTD_PARAM_LEN_MAX]; int ubi_num; int vid_hdr_offs; int max_beb_per1024; int enable_fm; int need_resv_pool; }; /* Numbers of elements set in the @mtd_dev_param array */ static int mtd_devs; /* MTD devices specification parameters */ static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES]; #ifdef CONFIG_MTD_UBI_FASTMAP /* UBI module parameter to enable fastmap automatically on non-fastmap images */ static bool fm_autoconvert; static bool fm_debug; #endif /* Slab cache for wear-leveling entries */ struct kmem_cache *ubi_wl_entry_slab; /* UBI control character device */ static struct miscdevice ubi_ctrl_cdev = { .minor = MISC_DYNAMIC_MINOR, .name = "ubi_ctrl", .fops = &ubi_ctrl_cdev_operations, }; /* All UBI devices in system */ static struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; /* Serializes UBI devices creations and removals */ DEFINE_MUTEX(ubi_devices_mutex); /* Protects @ubi_devices, @ubi->ref_count and @ubi->is_dead */ static DEFINE_SPINLOCK(ubi_devices_lock); /* "Show" method for files in '/<sysfs>/class/ubi/' */ /* UBI version attribute ('/<sysfs>/class/ubi/version') */ static ssize_t version_show(const struct class *class, const struct class_attribute *attr, char *buf) { return sprintf(buf, "%d\n", UBI_VERSION); } static CLASS_ATTR_RO(version); static struct attribute *ubi_class_attrs[] = { &class_attr_version.attr, NULL, }; ATTRIBUTE_GROUPS(ubi_class); /* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */ const struct class ubi_class = { .name = UBI_NAME_STR, .class_groups = ubi_class_groups, }; static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf); /* UBI device attributes (correspond to files in '/<sysfs>/class/ubi/ubiX') */ static struct device_attribute dev_eraseblock_size = __ATTR(eraseblock_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_avail_eraseblocks = __ATTR(avail_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_total_eraseblocks = __ATTR(total_eraseblocks, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_volumes_count = __ATTR(volumes_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_ec = __ATTR(max_ec, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_reserved_for_bad = __ATTR(reserved_for_bad, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bad_peb_count = __ATTR(bad_peb_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_max_vol_count = __ATTR(max_vol_count, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_min_io_size = __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bgt_enabled = __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_mtd_num = __ATTR(mtd_num, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_ro_mode = __ATTR(ro_mode, S_IRUGO, dev_attribute_show, NULL); /** * ubi_volume_notify - send a volume change notification. * @ubi: UBI device description object * @vol: volume description object of the changed volume * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * * This is a helper function which notifies all subscribers about a volume * change event (creation, removal, re-sizing, re-naming, updating). Returns * zero in case of success and a negative error code in case of failure. */ int ubi_volume_notify(struct ubi_device *ubi, struct ubi_volume *vol, int ntype) { int ret; struct ubi_notification nt; ubi_do_get_device_info(ubi, &nt.di); ubi_do_get_volume_info(ubi, vol, &nt.vi); switch (ntype) { case UBI_VOLUME_ADDED: case UBI_VOLUME_REMOVED: case UBI_VOLUME_RESIZED: case UBI_VOLUME_RENAMED: ret = ubi_update_fastmap(ubi); if (ret) ubi_msg(ubi, "Unable to write a new fastmap: %i", ret); } return blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); } /** * ubi_notify_all - send a notification to all volumes. * @ubi: UBI device description object * @ntype: notification type to send (%UBI_VOLUME_ADDED, etc) * @nb: the notifier to call * * This function walks all volumes of UBI device @ubi and sends the @ntype * notification for each volume. If @nb is %NULL, then all registered notifiers * are called, otherwise only the @nb notifier is called. Returns the number of * sent notifications. */ int ubi_notify_all(struct ubi_device *ubi, int ntype, struct notifier_block *nb) { struct ubi_notification nt; int i, count = 0; ubi_do_get_device_info(ubi, &nt.di); mutex_lock(&ubi->device_mutex); for (i = 0; i < ubi->vtbl_slots; i++) { /* * Since the @ubi->device is locked, and we are not going to * change @ubi->volumes, we do not have to lock * @ubi->volumes_lock. */ if (!ubi->volumes[i]) continue; ubi_do_get_volume_info(ubi, ubi->volumes[i], &nt.vi); if (nb) nb->notifier_call(nb, ntype, &nt); else blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt); count += 1; } mutex_unlock(&ubi->device_mutex); return count; } /** * ubi_enumerate_volumes - send "add" notification for all existing volumes. * @nb: the notifier to call * * This function walks all UBI devices and volumes and sends the * %UBI_VOLUME_ADDED notification for each volume. If @nb is %NULL, then all * registered notifiers are called, otherwise only the @nb notifier is called. * Returns the number of sent notifications. */ int ubi_enumerate_volumes(struct notifier_block *nb) { int i, count = 0; /* * Since the @ubi_devices_mutex is locked, and we are not going to * change @ubi_devices, we do not have to lock @ubi_devices_lock. */ for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (!ubi) continue; count += ubi_notify_all(ubi, UBI_VOLUME_ADDED, nb); } return count; } /** * ubi_get_device - get UBI device. * @ubi_num: UBI device number * * This function returns UBI device description object for UBI device number * @ubi_num, or %NULL if the device does not exist. This function increases the * device reference count to prevent removal of the device. In other words, the * device cannot be removed if its reference count is not zero. */ struct ubi_device *ubi_get_device(int ubi_num) { struct ubi_device *ubi; spin_lock(&ubi_devices_lock); ubi = ubi_devices[ubi_num]; if (ubi && ubi->is_dead) ubi = NULL; if (ubi) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); } spin_unlock(&ubi_devices_lock); return ubi; } /** * ubi_put_device - drop an UBI device reference. * @ubi: UBI device description object */ void ubi_put_device(struct ubi_device *ubi) { spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; put_device(&ubi->dev); spin_unlock(&ubi_devices_lock); } /** * ubi_get_by_major - get UBI device by character device major number. * @major: major number * * This function is similar to 'ubi_get_device()', but it searches the device * by its major number. */ struct ubi_device *ubi_get_by_major(int major) { int i; struct ubi_device *ubi; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); spin_unlock(&ubi_devices_lock); return ubi; } } spin_unlock(&ubi_devices_lock); return NULL; } /** * ubi_major2num - get UBI device number by character device major number. * @major: major number * * This function searches UBI device number object by its major number. If UBI * device was not found, this function returns -ENODEV, otherwise the UBI device * number is returned. */ int ubi_major2num(int major) { int i, ubi_num = -ENODEV; spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_num = ubi->ubi_num; break; } } spin_unlock(&ubi_devices_lock); return ubi_num; } /* "Show" method for files in '/<sysfs>/class/ubi/ubiX/' */ static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf) { ssize_t ret; struct ubi_device *ubi; /* * The below code looks weird, but it actually makes sense. We get the * UBI device reference from the contained 'struct ubi_device'. But it * is unclear if the device was removed or not yet. Indeed, if the * device was removed before we increased its reference count, * 'ubi_get_device()' will return -ENODEV and we fail. * * Remember, 'struct ubi_device' is freed in the release function, so * we still can use 'ubi->ubi_num'. */ ubi = container_of(dev, struct ubi_device, dev); if (attr == &dev_eraseblock_size) ret = sprintf(buf, "%d\n", ubi->leb_size); else if (attr == &dev_avail_eraseblocks) ret = sprintf(buf, "%d\n", ubi->avail_pebs); else if (attr == &dev_total_eraseblocks) ret = sprintf(buf, "%d\n", ubi->good_peb_count); else if (attr == &dev_volumes_count) ret = sprintf(buf, "%d\n", ubi->vol_count - UBI_INT_VOL_COUNT); else if (attr == &dev_max_ec) ret = sprintf(buf, "%d\n", ubi->max_ec); else if (attr == &dev_reserved_for_bad) ret = sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); else if (attr == &dev_bad_peb_count) ret = sprintf(buf, "%d\n", ubi->bad_peb_count); else if (attr == &dev_max_vol_count) ret = sprintf(buf, "%d\n", ubi->vtbl_slots); else if (attr == &dev_min_io_size) ret = sprintf(buf, "%d\n", ubi->min_io_size); else if (attr == &dev_bgt_enabled) ret = sprintf(buf, "%d\n", ubi->thread_enabled); else if (attr == &dev_mtd_num) ret = sprintf(buf, "%d\n", ubi->mtd->index); else if (attr == &dev_ro_mode) ret = sprintf(buf, "%d\n", ubi->ro_mode); else ret = -EINVAL; return ret; } static struct attribute *ubi_dev_attrs[] = { &dev_eraseblock_size.attr, &dev_avail_eraseblocks.attr, &dev_total_eraseblocks.attr, &dev_volumes_count.attr, &dev_max_ec.attr, &dev_reserved_for_bad.attr, &dev_bad_peb_count.attr, &dev_max_vol_count.attr, &dev_min_io_size.attr, &dev_bgt_enabled.attr, &dev_mtd_num.attr, &dev_ro_mode.attr, NULL }; ATTRIBUTE_GROUPS(ubi_dev); static void dev_release(struct device *dev) { struct ubi_device *ubi = container_of(dev, struct ubi_device, dev); kfree(ubi); } /** * kill_volumes - destroy all user volumes. * @ubi: UBI device description object */ static void kill_volumes(struct ubi_device *ubi) { int i; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) ubi_free_volume(ubi, ubi->volumes[i]); } /** * uif_init - initialize user interfaces for an UBI device. * @ubi: UBI device description object * * This function initializes various user interfaces for an UBI device. If the * initialization fails at an early stage, this function frees all the * resources it allocated, returns an error. * * This function returns zero in case of success and a negative error code in * case of failure. */ static int uif_init(struct ubi_device *ubi) { int i, err; dev_t dev; sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num); /* * Major numbers for the UBI character devices are allocated * dynamically. Major numbers of volume character devices are * equivalent to ones of the corresponding UBI character device. Minor * numbers of UBI character devices are 0, while minor numbers of * volume character devices start from 1. Thus, we allocate one major * number and ubi->vtbl_slots + 1 minor numbers. */ err = alloc_chrdev_region(&dev, 0, ubi->vtbl_slots + 1, ubi->ubi_name); if (err) { ubi_err(ubi, "cannot register UBI character devices"); return err; } ubi->dev.devt = dev; ubi_assert(MINOR(dev) == 0); cdev_init(&ubi->cdev, &ubi_cdev_operations); dbg_gen("%s major is %u", ubi->ubi_name, MAJOR(dev)); ubi->cdev.owner = THIS_MODULE; dev_set_name(&ubi->dev, UBI_NAME_STR "%d", ubi->ubi_num); err = cdev_device_add(&ubi->cdev, &ubi->dev); if (err) goto out_unreg; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) { err = ubi_add_volume(ubi, ubi->volumes[i]); if (err) { ubi_err(ubi, "cannot add volume %d", i); ubi->volumes[i] = NULL; goto out_volumes; } } return 0; out_volumes: kill_volumes(ubi); cdev_device_del(&ubi->cdev, &ubi->dev); out_unreg: unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); ubi_err(ubi, "cannot initialize UBI %s, error %d", ubi->ubi_name, err); return err; } /** * uif_close - close user interfaces for an UBI device. * @ubi: UBI device description object * * Note, since this function un-registers UBI volume device objects (@vol->dev), * the memory allocated voe the volumes is freed as well (in the release * function). */ static void uif_close(struct ubi_device *ubi) { kill_volumes(ubi); cdev_device_del(&ubi->cdev, &ubi->dev); unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); } /** * ubi_free_volumes_from - free volumes from specific index. * @ubi: UBI device description object * @from: the start index used for volume free. */ static void ubi_free_volumes_from(struct ubi_device *ubi, int from) { int i; for (i = from; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { if (!ubi->volumes[i] || ubi->volumes[i]->is_dead) continue; ubi_eba_replace_table(ubi->volumes[i], NULL); ubi_fastmap_destroy_checkmap(ubi->volumes[i]); kfree(ubi->volumes[i]); ubi->volumes[i] = NULL; } } /** * ubi_free_all_volumes - free all volumes. * @ubi: UBI device description object */ void ubi_free_all_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, 0); } /** * ubi_free_internal_volumes - free internal volumes. * @ubi: UBI device description object */ void ubi_free_internal_volumes(struct ubi_device *ubi) { ubi_free_volumes_from(ubi, ubi->vtbl_slots); } static int get_bad_peb_limit(const struct ubi_device *ubi, int max_beb_per1024) { int limit, device_pebs; uint64_t device_size; if (!max_beb_per1024) { /* * Since max_beb_per1024 has not been set by the user in either * the cmdline or Kconfig, use mtd_max_bad_blocks to set the * limit if it is supported by the device. */ limit = mtd_max_bad_blocks(ubi->mtd, 0, ubi->mtd->size); if (limit < 0) return 0; return limit; } /* * Here we are using size of the entire flash chip and * not just the MTD partition size because the maximum * number of bad eraseblocks is a percentage of the * whole device and bad eraseblocks are not fairly * distributed over the flash chip. So the worst case * is that all the bad eraseblocks of the chip are in * the MTD partition we are attaching (ubi->mtd). */ device_size = mtd_get_device_size(ubi->mtd); device_pebs = mtd_div_by_eb(device_size, ubi->mtd); limit = mult_frac(device_pebs, max_beb_per1024, 1024); /* Round it up */ if (mult_frac(limit, 1024, max_beb_per1024) < device_pebs) limit += 1; return limit; } /** * io_init - initialize I/O sub-system for a given UBI device. * @ubi: UBI device description object * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are * assumed: * o EC header is always at offset zero - this cannot be changed; * o VID header starts just after the EC header at the closest address * aligned to @io->hdrs_min_io_size; * o data starts just after the VID header at the closest address aligned to * @io->min_io_size * * This function returns zero in case of success and a negative error code in * case of failure. */ static int io_init(struct ubi_device *ubi, int max_beb_per1024) { dbg_gen("sizeof(struct ubi_ainf_peb) %zu", sizeof(struct ubi_ainf_peb)); dbg_gen("sizeof(struct ubi_wl_entry) %zu", sizeof(struct ubi_wl_entry)); if (ubi->mtd->numeraseregions != 0) { /* * Some flashes have several erase regions. Different regions * may have different eraseblock size and other * characteristics. It looks like mostly multi-region flashes * have one "main" region and one or more small regions to * store boot loader code or boot parameters or whatever. I * guess we should just pick the largest region. But this is * not implemented. */ ubi_err(ubi, "multiple regions, not implemented"); return -EINVAL; } if (ubi->vid_hdr_offset < 0) return -EINVAL; /* * Note, in this implementation we support MTD devices with 0x7FFFFFFF * physical eraseblocks maximum. */ ubi->peb_size = ubi->mtd->erasesize; ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd); ubi->flash_size = ubi->mtd->size; if (mtd_can_have_bb(ubi->mtd)) { ubi->bad_allowed = 1; ubi->bad_peb_limit = get_bad_peb_limit(ubi, max_beb_per1024); } if (ubi->mtd->type == MTD_NORFLASH) ubi->nor_flash = 1; ubi->min_io_size = ubi->mtd->writesize; ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft; /* * Make sure minimal I/O unit is power of 2. Note, there is no * fundamental reason for this assumption. It is just an optimization * which allows us to avoid costly division operations. */ if (!is_power_of_2(ubi->min_io_size)) { ubi_err(ubi, "min. I/O unit (%d) is not power of 2", ubi->min_io_size); return -EINVAL; } ubi_assert(ubi->hdrs_min_io_size > 0); ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size); ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0); ubi->max_write_size = ubi->mtd->writebufsize; /* * Maximum write size has to be greater or equivalent to min. I/O * size, and be multiple of min. I/O size. */ if (ubi->max_write_size < ubi->min_io_size || ubi->max_write_size % ubi->min_io_size || !is_power_of_2(ubi->max_write_size)) { ubi_err(ubi, "bad write buffer size %d for %d min. I/O unit", ubi->max_write_size, ubi->min_io_size); return -EINVAL; } /* Calculate default aligned sizes of EC and VID headers */ ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); dbg_gen("min_io_size %d", ubi->min_io_size); dbg_gen("max_write_size %d", ubi->max_write_size); dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size); dbg_gen("ec_hdr_alsize %d", ubi->ec_hdr_alsize); dbg_gen("vid_hdr_alsize %d", ubi->vid_hdr_alsize); if (ubi->vid_hdr_offset == 0) /* Default offset */ ubi->vid_hdr_offset = ubi->vid_hdr_aloffset = ubi->ec_hdr_alsize; else { ubi->vid_hdr_aloffset = ubi->vid_hdr_offset & ~(ubi->hdrs_min_io_size - 1); ubi->vid_hdr_shift = ubi->vid_hdr_offset - ubi->vid_hdr_aloffset; } /* * Memory allocation for VID header is ubi->vid_hdr_alsize * which is described in comments in io.c. * Make sure VID header shift + UBI_VID_HDR_SIZE not exceeds * ubi->vid_hdr_alsize, so that all vid header operations * won't access memory out of bounds. */ if ((ubi->vid_hdr_shift + UBI_VID_HDR_SIZE) > ubi->vid_hdr_alsize) { ubi_err(ubi, "Invalid VID header offset %d, VID header shift(%d)" " + VID header size(%zu) > VID header aligned size(%d).", ubi->vid_hdr_offset, ubi->vid_hdr_shift, UBI_VID_HDR_SIZE, ubi->vid_hdr_alsize); return -EINVAL; } /* Similar for the data offset */ ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE; ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); dbg_gen("vid_hdr_offset %d", ubi->vid_hdr_offset); dbg_gen("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset); dbg_gen("vid_hdr_shift %d", ubi->vid_hdr_shift); dbg_gen("leb_start %d", ubi->leb_start); /* The shift must be aligned to 32-bit boundary */ if (ubi->vid_hdr_shift % 4) { ubi_err(ubi, "unaligned VID header shift %d", ubi->vid_hdr_shift); return -EINVAL; } /* Check sanity */ if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE || ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE || ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE || ubi->leb_start & (ubi->min_io_size - 1)) { ubi_err(ubi, "bad VID header (%d) or data offsets (%d)", ubi->vid_hdr_offset, ubi->leb_start); return -EINVAL; } /* * Set maximum amount of physical erroneous eraseblocks to be 10%. * Erroneous PEB are those which have read errors. */ ubi->max_erroneous = ubi->peb_count / 10; if (ubi->max_erroneous < 16) ubi->max_erroneous = 16; dbg_gen("max_erroneous %d", ubi->max_erroneous); /* * It may happen that EC and VID headers are situated in one minimal * I/O unit. In this case we can only accept this UBI image in * read-only mode. */ if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) { ubi_warn(ubi, "EC and VID headers are in the same minimal I/O unit, switch to read-only mode"); ubi->ro_mode = 1; } ubi->leb_size = ubi->peb_size - ubi->leb_start; if (!(ubi->mtd->flags & MTD_WRITEABLE)) { ubi_msg(ubi, "MTD device %d is write-protected, attach in read-only mode", ubi->mtd->index); ubi->ro_mode = 1; } /* * Note, ideally, we have to initialize @ubi->bad_peb_count here. But * unfortunately, MTD does not provide this information. We should loop * over all physical eraseblocks and invoke mtd->block_is_bad() for * each physical eraseblock. So, we leave @ubi->bad_peb_count * uninitialized so far. */ return 0; } /** * autoresize - re-size the volume which has the "auto-resize" flag set. * @ubi: UBI device description object * @vol_id: ID of the volume to re-size * * This function re-sizes the volume marked by the %UBI_VTBL_AUTORESIZE_FLG in * the volume table to the largest possible size. See comments in ubi-header.h * for more description of the flag. Returns zero in case of success and a * negative error code in case of failure. */ static int autoresize(struct ubi_device *ubi, int vol_id) { struct ubi_volume_desc desc; struct ubi_volume *vol = ubi->volumes[vol_id]; int err, old_reserved_pebs = vol->reserved_pebs; if (ubi->ro_mode) { ubi_warn(ubi, "skip auto-resize because of R/O mode"); return 0; } /* * Clear the auto-resize flag in the volume in-memory copy of the * volume table, and 'ubi_resize_volume()' will propagate this change * to the flash. */ ubi->vtbl[vol_id].flags &= ~UBI_VTBL_AUTORESIZE_FLG; if (ubi->avail_pebs == 0) { struct ubi_vtbl_record vtbl_rec; /* * No available PEBs to re-size the volume, clear the flag on * flash and exit. */ vtbl_rec = ubi->vtbl[vol_id]; err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); if (err) ubi_err(ubi, "cannot clean auto-resize flag for volume %d", vol_id); } else { desc.vol = vol; err = ubi_resize_volume(&desc, old_reserved_pebs + ubi->avail_pebs); if (err) ubi_err(ubi, "cannot auto-resize volume %d", vol_id); } if (err) return err; ubi_msg(ubi, "volume %d (\"%s\") re-sized from %d to %d LEBs", vol_id, vol->name, old_reserved_pebs, vol->reserved_pebs); return 0; } /** * ubi_attach_mtd_dev - attach an MTD device. * @mtd: MTD device description object * @ubi_num: number to assign to the new UBI device * @vid_hdr_offset: VID header offset * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs * @disable_fm: whether disable fastmap * @need_resv_pool: whether reserve pebs to fill fm_pool * * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in * which case this function finds a vacant device number and assigns it * automatically. Returns the new UBI device number in case of success and a * negative error code in case of failure. * * If @disable_fm is true, ubi doesn't create new fastmap even the module param * 'fm_autoconvert' is set, and existed old fastmap will be destroyed after * doing full scanning. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset, int max_beb_per1024, bool disable_fm, bool need_resv_pool) { struct ubi_device *ubi; int i, err; if (max_beb_per1024 < 0 || max_beb_per1024 > MAX_MTD_UBI_BEB_LIMIT) return -EINVAL; if (!max_beb_per1024) max_beb_per1024 = CONFIG_MTD_UBI_BEB_LIMIT; /* * Check if we already have the same MTD device attached. * * Note, this function assumes that UBI devices creations and deletions * are serialized, so it does not take the &ubi_devices_lock. */ for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; if (ubi && mtd->index == ubi->mtd->index) { pr_err("ubi: mtd%d is already attached to ubi%d\n", mtd->index, i); return -EEXIST; } } /* * Make sure this MTD device is not emulated on top of an UBI volume * already. Well, generally this recursion works fine, but there are * different problems like the UBI module takes a reference to itself * by attaching (and thus, opening) the emulated MTD device. This * results in inability to unload the module. And in general it makes * no sense to attach emulated MTD devices, so we prohibit this. */ if (mtd->type == MTD_UBIVOLUME) { pr_err("ubi: refuse attaching mtd%d - it is already emulated on top of UBI\n", mtd->index); return -EINVAL; } /* * Both UBI and UBIFS have been designed for SLC NAND and NOR flashes. * MLC NAND is different and needs special care, otherwise UBI or UBIFS * will die soon and you will lose all your data. * Relax this rule if the partition we're attaching to operates in SLC * mode. */ if (mtd->type == MTD_MLCNANDFLASH && !(mtd->flags & MTD_SLC_ON_MLC_EMULATION)) { pr_err("ubi: refuse attaching mtd%d - MLC NAND is not supported\n", mtd->index); return -EINVAL; } /* UBI cannot work on flashes with zero erasesize. */ if (!mtd->erasesize) { pr_err("ubi: refuse attaching mtd%d - zero erasesize flash is not supported\n", mtd->index); return -EINVAL; } if (ubi_num == UBI_DEV_NUM_AUTO) { /* Search for an empty slot in the @ubi_devices array */ for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) if (!ubi_devices[ubi_num]) break; if (ubi_num == UBI_MAX_DEVICES) { pr_err("ubi: only %d UBI devices may be created\n", UBI_MAX_DEVICES); return -ENFILE; } } else { if (ubi_num >= UBI_MAX_DEVICES) return -EINVAL; /* Make sure ubi_num is not busy */ if (ubi_devices[ubi_num]) { pr_err("ubi: ubi%i already exists\n", ubi_num); return -EEXIST; } } ubi = kzalloc(sizeof(struct ubi_device), GFP_KERNEL); if (!ubi) return -ENOMEM; device_initialize(&ubi->dev); ubi->dev.release = dev_release; ubi->dev.class = &ubi_class; ubi->dev.groups = ubi_dev_groups; ubi->dev.parent = &mtd->dev; ubi->mtd = mtd; ubi->ubi_num = ubi_num; ubi->vid_hdr_offset = vid_hdr_offset; ubi->autoresize_vol_id = -1; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_pool.used = ubi->fm_pool.size = 0; ubi->fm_wl_pool.used = ubi->fm_wl_pool.size = 0; /* * fm_pool.max_size is 5% of the total number of PEBs but it's also * between UBI_FM_MAX_POOL_SIZE and UBI_FM_MIN_POOL_SIZE. */ ubi->fm_pool.max_size = min(((int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) / 100) * 5, UBI_FM_MAX_POOL_SIZE); ubi->fm_pool.max_size = max(ubi->fm_pool.max_size, UBI_FM_MIN_POOL_SIZE); ubi->fm_wl_pool.max_size = ubi->fm_pool.max_size / 2; ubi->fm_pool_rsv_cnt = need_resv_pool ? ubi->fm_pool.max_size : 0; ubi->fm_disabled = (!fm_autoconvert || disable_fm) ? 1 : 0; if (fm_debug) ubi_enable_dbg_chk_fastmap(ubi); if (!ubi->fm_disabled && (int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd) <= UBI_FM_MAX_START) { ubi_err(ubi, "More than %i PEBs are needed for fastmap, sorry.", UBI_FM_MAX_START); ubi->fm_disabled = 1; } ubi_msg(ubi, "default fastmap pool size: %d", ubi->fm_pool.max_size); ubi_msg(ubi, "default fastmap WL pool size: %d", ubi->fm_wl_pool.max_size); #else ubi->fm_disabled = 1; #endif mutex_init(&ubi->buf_mutex); mutex_init(&ubi->ckvol_mutex); mutex_init(&ubi->device_mutex); spin_lock_init(&ubi->volumes_lock); init_rwsem(&ubi->fm_protect); init_rwsem(&ubi->fm_eba_sem); ubi_msg(ubi, "attaching mtd%d", mtd->index); err = io_init(ubi, max_beb_per1024); if (err) goto out_free; err = -ENOMEM; ubi->peb_buf = vmalloc(ubi->peb_size); if (!ubi->peb_buf) goto out_free; #ifdef CONFIG_MTD_UBI_FASTMAP ubi->fm_size = ubi_calc_fm_size(ubi); ubi->fm_buf = vzalloc(ubi->fm_size); if (!ubi->fm_buf) goto out_free; #endif err = ubi_attach(ubi, disable_fm ? 1 : 0); if (err) { ubi_err(ubi, "failed to attach mtd%d, error %d", mtd->index, err); goto out_free; } if (ubi->autoresize_vol_id != -1) { err = autoresize(ubi, ubi->autoresize_vol_id); if (err) goto out_detach; } err = uif_init(ubi); if (err) goto out_detach; err = ubi_debugfs_init_dev(ubi); if (err) goto out_uif; ubi->bgt_thread = kthread_create(ubi_thread, ubi, "%s", ubi->bgt_name); if (IS_ERR(ubi->bgt_thread)) { err = PTR_ERR(ubi->bgt_thread); ubi_err(ubi, "cannot spawn \"%s\", error %d", ubi->bgt_name, err); goto out_debugfs; } ubi_msg(ubi, "attached mtd%d (name \"%s\", size %llu MiB)", mtd->index, mtd->name, ubi->flash_size >> 20); ubi_msg(ubi, "PEB size: %d bytes (%d KiB), LEB size: %d bytes", ubi->peb_size, ubi->peb_size >> 10, ubi->leb_size); ubi_msg(ubi, "min./max. I/O unit sizes: %d/%d, sub-page size %d", ubi->min_io_size, ubi->max_write_size, ubi->hdrs_min_io_size); ubi_msg(ubi, "VID header offset: %d (aligned %d), data offset: %d", ubi->vid_hdr_offset, ubi->vid_hdr_aloffset, ubi->leb_start); ubi_msg(ubi, "good PEBs: %d, bad PEBs: %d, corrupted PEBs: %d", ubi->good_peb_count, ubi->bad_peb_count, ubi->corr_peb_count); ubi_msg(ubi, "user volume: %d, internal volumes: %d, max. volumes count: %d", ubi->vol_count - UBI_INT_VOL_COUNT, UBI_INT_VOL_COUNT, ubi->vtbl_slots); ubi_msg(ubi, "max/mean erase counter: %d/%d, WL threshold: %d, image sequence number: %u", ubi->max_ec, ubi->mean_ec, CONFIG_MTD_UBI_WL_THRESHOLD, ubi->image_seq); ubi_msg(ubi, "available PEBs: %d, total reserved PEBs: %d, PEBs reserved for bad PEB handling: %d", ubi->avail_pebs, ubi->rsvd_pebs, ubi->beb_rsvd_pebs); /* * The below lock makes sure we do not race with 'ubi_thread()' which * checks @ubi->thread_enabled. Otherwise we may fail to wake it up. */ spin_lock(&ubi->wl_lock); ubi->thread_enabled = 1; wake_up_process(ubi->bgt_thread); spin_unlock(&ubi->wl_lock); ubi_devices[ubi_num] = ubi; ubi_notify_all(ubi, UBI_VOLUME_ADDED, NULL); return ubi_num; out_debugfs: ubi_debugfs_exit_dev(ubi); out_uif: uif_close(ubi); out_detach: ubi_wl_close(ubi); ubi_free_all_volumes(ubi); vfree(ubi->vtbl); out_free: vfree(ubi->peb_buf); vfree(ubi->fm_buf); put_device(&ubi->dev); return err; } /** * ubi_detach_mtd_dev - detach an MTD device. * @ubi_num: UBI device number to detach from * @anyway: detach MTD even if device reference count is not zero * * This function destroys an UBI device number @ubi_num and detaches the * underlying MTD device. Returns zero in case of success and %-EBUSY if the * UBI device is busy and cannot be destroyed, and %-EINVAL if it does not * exist. * * Note, the invocations of this function has to be serialized by the * @ubi_devices_mutex. */ int ubi_detach_mtd_dev(int ubi_num, int anyway) { struct ubi_device *ubi; if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) return -EINVAL; ubi = ubi_get_device(ubi_num); if (!ubi) return -EINVAL; spin_lock(&ubi_devices_lock); ubi->ref_count -= 1; if (ubi->ref_count) { if (!anyway) { spin_unlock(&ubi_devices_lock); return -EBUSY; } /* This may only happen if there is a bug */ ubi_err(ubi, "%s reference count %d, destroy anyway", ubi->ubi_name, ubi->ref_count); } ubi->is_dead = true; spin_unlock(&ubi_devices_lock); ubi_notify_all(ubi, UBI_VOLUME_SHUTDOWN, NULL); spin_lock(&ubi_devices_lock); put_device(&ubi->dev); ubi_devices[ubi_num] = NULL; spin_unlock(&ubi_devices_lock); ubi_assert(ubi_num == ubi->ubi_num); ubi_notify_all(ubi, UBI_VOLUME_REMOVED, NULL); ubi_msg(ubi, "detaching mtd%d", ubi->mtd->index); #ifdef CONFIG_MTD_UBI_FASTMAP /* If we don't write a new fastmap at detach time we lose all * EC updates that have been made since the last written fastmap. * In case of fastmap debugging we omit the update to simulate an * unclean shutdown. */ if (!ubi_dbg_chk_fastmap(ubi)) ubi_update_fastmap(ubi); #endif /* * Before freeing anything, we have to stop the background thread to * prevent it from doing anything on this device while we are freeing. */ if (ubi->bgt_thread) kthread_stop(ubi->bgt_thread); #ifdef CONFIG_MTD_UBI_FASTMAP cancel_work_sync(&ubi->fm_work); #endif ubi_debugfs_exit_dev(ubi); uif_close(ubi); ubi_wl_close(ubi); ubi_free_internal_volumes(ubi); vfree(ubi->vtbl); vfree(ubi->peb_buf); vfree(ubi->fm_buf); ubi_msg(ubi, "mtd%d is detached", ubi->mtd->index); put_mtd_device(ubi->mtd); put_device(&ubi->dev); return 0; } /** * open_mtd_by_chdev - open an MTD device by its character device node path. * @mtd_dev: MTD character device node path * * This helper function opens an MTD device by its character node device path. * Returns MTD device description object in case of success and a negative * error code in case of failure. */ static struct mtd_info * __init open_mtd_by_chdev(const char *mtd_dev) { int err, minor; struct path path; struct kstat stat; /* Probably this is an MTD character device node path */ err = kern_path(mtd_dev, LOOKUP_FOLLOW, &path); if (err) return ERR_PTR(err); err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); path_put(&path); if (err) return ERR_PTR(err); /* MTD device number is defined by the major / minor numbers */ if (MAJOR(stat.rdev) != MTD_CHAR_MAJOR || !S_ISCHR(stat.mode)) return ERR_PTR(-EINVAL); minor = MINOR(stat.rdev); if (minor & 1) /* * Just do not think the "/dev/mtdrX" devices support is need, * so do not support them to avoid doing extra work. */ return ERR_PTR(-EINVAL); return get_mtd_device(NULL, minor / 2); } /** * open_mtd_device - open MTD device by name, character device path, or number. * @mtd_dev: name, character device node path, or MTD device device number * * This function tries to open and MTD device described by @mtd_dev string, * which is first treated as ASCII MTD device number, and if it is not true, it * is treated as MTD device name, and if that is also not true, it is treated * as MTD character device node path. Returns MTD device description object in * case of success and a negative error code in case of failure. */ static struct mtd_info * __init open_mtd_device(const char *mtd_dev) { struct mtd_info *mtd; int mtd_num; char *endp; mtd_num = simple_strtoul(mtd_dev, &endp, 0); if (*endp != '\0' || mtd_dev == endp) { /* * This does not look like an ASCII integer, probably this is * MTD device name. */ mtd = get_mtd_device_nm(mtd_dev); if (PTR_ERR(mtd) == -ENODEV) /* Probably this is an MTD character device node path */ mtd = open_mtd_by_chdev(mtd_dev); } else mtd = get_mtd_device(NULL, mtd_num); return mtd; } static void ubi_notify_add(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); int err; if (!of_device_is_compatible(np, "linux,ubi")) return; /* * we are already holding &mtd_table_mutex, but still need * to bump refcount */ err = __get_mtd_device(mtd); if (err) return; /* called while holding mtd_table_mutex */ mutex_lock_nested(&ubi_devices_mutex, SINGLE_DEPTH_NESTING); err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO, 0, 0, false, false); mutex_unlock(&ubi_devices_mutex); if (err < 0) __put_mtd_device(mtd); } static void ubi_notify_remove(struct mtd_info *mtd) { /* do nothing for now */ } static struct mtd_notifier ubi_mtd_notifier = { .add = ubi_notify_add, .remove = ubi_notify_remove, }; static int __init ubi_init_attach(void) { int err, i, k; /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { struct mtd_dev_param *p = &mtd_dev_param[i]; struct mtd_info *mtd; cond_resched(); mtd = open_mtd_device(p->name); if (IS_ERR(mtd)) { err = PTR_ERR(mtd); pr_err("UBI error: cannot open mtd %s, error %d\n", p->name, err); /* See comment below re-ubi_is_module(). */ if (ubi_is_module()) goto out_detach; continue; } mutex_lock(&ubi_devices_mutex); err = ubi_attach_mtd_dev(mtd, p->ubi_num, p->vid_hdr_offs, p->max_beb_per1024, p->enable_fm == 0, p->need_resv_pool != 0); mutex_unlock(&ubi_devices_mutex); if (err < 0) { pr_err("UBI error: cannot attach mtd%d\n", mtd->index); put_mtd_device(mtd); /* * Originally UBI stopped initializing on any error. * However, later on it was found out that this * behavior is not very good when UBI is compiled into * the kernel and the MTD devices to attach are passed * through the command line. Indeed, UBI failure * stopped whole boot sequence. * * To fix this, we changed the behavior for the * non-module case, but preserved the old behavior for * the module case, just for compatibility. This is a * little inconsistent, though. */ if (ubi_is_module()) goto out_detach; } } return 0; out_detach: for (k = 0; k < i; k++) if (ubi_devices[k]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } return err; } #ifndef CONFIG_MTD_UBI_MODULE late_initcall(ubi_init_attach); #endif static int __init ubi_init(void) { int err; /* Ensure that EC and VID headers have correct size */ BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); if (mtd_devs > UBI_MAX_DEVICES) { pr_err("UBI error: too many MTD devices, maximum is %d\n", UBI_MAX_DEVICES); return -EINVAL; } /* Create base sysfs directory and sysfs files */ err = class_register(&ubi_class); if (err < 0) return err; err = misc_register(&ubi_ctrl_cdev); if (err) { pr_err("UBI error: cannot register device\n"); goto out; } ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", sizeof(struct ubi_wl_entry), 0, 0, NULL); if (!ubi_wl_entry_slab) { err = -ENOMEM; goto out_dev_unreg; } err = ubi_debugfs_init(); if (err) goto out_slab; err = ubiblock_init(); if (err) { pr_err("UBI error: block: cannot initialize, error %d\n", err); /* See comment above re-ubi_is_module(). */ if (ubi_is_module()) goto out_debugfs; } register_mtd_user(&ubi_mtd_notifier); if (ubi_is_module()) { err = ubi_init_attach(); if (err) goto out_mtd_notifier; } return 0; out_mtd_notifier: unregister_mtd_user(&ubi_mtd_notifier); ubiblock_exit(); out_debugfs: ubi_debugfs_exit(); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: misc_deregister(&ubi_ctrl_cdev); out: class_unregister(&ubi_class); pr_err("UBI error: cannot initialize UBI, error %d\n", err); return err; } device_initcall(ubi_init); static void __exit ubi_exit(void) { int i; ubiblock_exit(); unregister_mtd_user(&ubi_mtd_notifier); for (i = 0; i < UBI_MAX_DEVICES; i++) if (ubi_devices[i]) { mutex_lock(&ubi_devices_mutex); ubi_detach_mtd_dev(ubi_devices[i]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } ubi_debugfs_exit(); kmem_cache_destroy(ubi_wl_entry_slab); misc_deregister(&ubi_ctrl_cdev); class_unregister(&ubi_class); } module_exit(ubi_exit); /** * bytes_str_to_int - convert a number of bytes string into an integer. * @str: the string to convert * * This function returns positive resulting integer in case of success and a * negative error code in case of failure. */ static int bytes_str_to_int(const char *str) { char *endp; unsigned long result; result = simple_strtoul(str, &endp, 0); if (str == endp || result >= INT_MAX) { pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } switch (*endp) { case 'G': result *= 1024; fallthrough; case 'M': result *= 1024; fallthrough; case 'K': result *= 1024; break; case '\0': break; default: pr_err("UBI error: incorrect bytes count: \"%s\"\n", str); return -EINVAL; } return result; } /** * ubi_mtd_param_parse - parse the 'mtd=' UBI parameter. * @val: the parameter value to parse * @kp: not used * * This function returns zero in case of success and a negative error code in * case of error. */ static int ubi_mtd_param_parse(const char *val, const struct kernel_param *kp) { int i, len; struct mtd_dev_param *p; char buf[MTD_PARAM_LEN_MAX]; char *pbuf = &buf[0]; char *tokens[MTD_PARAM_MAX_COUNT], *token; if (!val) return -EINVAL; if (mtd_devs == UBI_MAX_DEVICES) { pr_err("UBI error: too many parameters, max. is %d\n", UBI_MAX_DEVICES); return -EINVAL; } len = strnlen(val, MTD_PARAM_LEN_MAX); if (len == MTD_PARAM_LEN_MAX) { pr_err("UBI error: parameter \"%s\" is too long, max. is %d\n", val, MTD_PARAM_LEN_MAX); return -EINVAL; } if (len == 0) { pr_warn("UBI warning: empty 'mtd=' parameter - ignored\n"); return 0; } strcpy(buf, val); /* Get rid of the final newline */ if (buf[len - 1] == '\n') buf[len - 1] = '\0'; for (i = 0; i < MTD_PARAM_MAX_COUNT; i++) tokens[i] = strsep(&pbuf, ","); if (pbuf) { pr_err("UBI error: too many arguments at \"%s\"\n", val); return -EINVAL; } p = &mtd_dev_param[mtd_devs]; strcpy(&p->name[0], tokens[0]); token = tokens[1]; if (token) { p->vid_hdr_offs = bytes_str_to_int(token); if (p->vid_hdr_offs < 0) return p->vid_hdr_offs; } token = tokens[2]; if (token) { int err = kstrtoint(token, 10, &p->max_beb_per1024); if (err) { pr_err("UBI error: bad value for max_beb_per1024 parameter: %s\n", token); return -EINVAL; } } token = tokens[3]; if (token) { int err = kstrtoint(token, 10, &p->ubi_num); if (err || p->ubi_num < UBI_DEV_NUM_AUTO) { pr_err("UBI error: bad value for ubi_num parameter: %s\n", token); return -EINVAL; } } else p->ubi_num = UBI_DEV_NUM_AUTO; token = tokens[4]; if (token) { int err = kstrtoint(token, 10, &p->enable_fm); if (err) { pr_err("UBI error: bad value for enable_fm parameter: %s\n", token); return -EINVAL; } } else p->enable_fm = 0; token = tokens[5]; if (token) { int err = kstrtoint(token, 10, &p->need_resv_pool); if (err) { pr_err("UBI error: bad value for need_resv_pool parameter: %s\n", token); return -EINVAL; } } else p->need_resv_pool = 0; mtd_devs += 1; return 0; } module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 0400); MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: mtd=<name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]].\n" "Multiple \"mtd\" parameters may be specified.\n" "MTD devices may be specified by their number, name, or path to the MTD character device node.\n" "Optional \"vid_hdr_offs\" parameter specifies UBI VID header position to be used by UBI. (default value if 0)\n" "Optional \"max_beb_per1024\" parameter specifies the maximum expected bad eraseblock per 1024 eraseblocks. (default value (" __stringify(CONFIG_MTD_UBI_BEB_LIMIT) ") if 0)\n" "Optional \"ubi_num\" parameter specifies UBI device number which have to be assigned to the newly created UBI device (assigned automatically by default)\n" "Optional \"enable_fm\" parameter determines whether to enable fastmap during attach. If the value is non-zero, fastmap is enabled. Default value is 0.\n" "Optional \"need_resv_pool\" parameter determines whether to reserve pool->max_size pebs during attach. If the value is non-zero, peb reservation is enabled. Default value is 0.\n" "\n" "Example 1: mtd=/dev/mtd0 - attach MTD device /dev/mtd0.\n" "Example 2: mtd=content,1984 mtd=4 - attach MTD device with name \"content\" using VID header offset 1984, and MTD device number 4 with default VID header offset.\n" "Example 3: mtd=/dev/mtd1,0,25 - attach MTD device /dev/mtd1 using default VID header offset and reserve 25*nand_size_in_blocks/1024 erase blocks for bad block handling.\n" "Example 4: mtd=/dev/mtd1,0,0,5 - attach MTD device /dev/mtd1 to UBI 5 and using default values for the other fields.\n" "example 5: mtd=1,0,0,5 mtd=2,0,0,6,1 - attach MTD device /dev/mtd1 to UBI 5 and disable fastmap; attach MTD device /dev/mtd2 to UBI 6 and enable fastmap.(only works when fastmap is enabled and fm_autoconvert=Y).\n" "\t(e.g. if the NAND *chipset* has 4096 PEB, 100 will be reserved for this UBI device)."); #ifdef CONFIG_MTD_UBI_FASTMAP module_param(fm_autoconvert, bool, 0644); MODULE_PARM_DESC(fm_autoconvert, "Set this parameter to enable fastmap automatically on images without a fastmap."); module_param(fm_debug, bool, 0); MODULE_PARM_DESC(fm_debug, "Set this parameter to enable fastmap debugging by default. Warning, this will make fastmap slow!"); #endif MODULE_VERSION(__stringify(UBI_VERSION)); MODULE_DESCRIPTION("UBI - Unsorted Block Images"); MODULE_AUTHOR("Artem Bityutskiy"); MODULE_LICENSE("GPL"); |
1127 1128 11 11 7 12 4 1 1 19 18 3 2 4 1 1 1 13 13 13 13 13 13 1 13 2 4 4 4 4 12 12 12 4 4 4 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 2 1 26 11 11 1 1 1 1 1 1 1 1 9 16 17 1 17 17 2 2 2 1 1 1 1 1 1 1 1 12 1 12 1 1 1 1 13 1 1 17 16 17 1 1 1 1 1 4 4 4 13 12 12 12 12 12 2 2 2 4 4 4 2 4 4 4 18 18 18 18 18 18 11 19 19 1 7 11 19 7 12 1 1 26 26 1 1 25 25 25 3 25 20 20 2 19 19 18 18 19 19 2 19 7 12 12 12 2 12 2 2 12 12 12 12 12 12 12 7 7 7 7 7 7 9 4 4 4 4 4 4 4 3 4 4 4 4 4 4 2 4 1 4 1 4 1 5 2 18 1 1 1 18 1 1 1 18 1 1 1 17 17 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 6 10 10 10 9 9 9 9 9 8 9 12 12 11 4 4 11 4 4 12 28 28 27 28 7 7 7 5 5 5 5 4 4 4 4 4 68 68 68 7 6 6 7 7 10 10 10 20 27 27 4 27 3 3 3 3 2 21 22 22 20 22 2 1 1 2 1 31 31 31 31 31 31 31 31 30 31 31 25 4 4 1 1 1 1 1 1 1 1 1 1 1 1 24 23 22 1 21 24 18 21 3 26 26 26 26 26 21 26 1 26 23 4 4 1 1 20 20 16 17 23 26 25 24 24 26 1 28 1 26 28 26 26 26 26 26 26 26 26 1 26 26 26 4 2 2 2 2 18 18 18 17 15 16 17 16 16 15 12 14 5 14 1 14 14 12 11 14 13 11 12 12 6 3 3 3 2 3 11 9 9 11 9 2 18 18 18 18 18 18 17 17 18 4 18 14 18 18 18 18 18 18 18 17 18 18 18 17 1 1 1 1 18 18 18 18 2 1 2 25 24 16 17 17 17 17 17 17 17 17 13 6 7 11 17 16 17 25 24 25 25 25 5 14 12 14 14 1125 35 1 1 6 10 10 10 3 3 3 5 1127 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/team/team.c - Network team device driver * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #include <linux/ethtool.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/errno.h> #include <linux/ctype.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netpoll.h> #include <linux/if_vlan.h> #include <linux/if_arp.h> #include <linux/socket.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <net/genetlink.h> #include <net/netdev_lock.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <linux/if_team.h> #include "team_nl.h" #define DRV_NAME "team" /********** * Helpers **********/ static struct team_port *team_port_get_rtnl(const struct net_device *dev) { struct team_port *port = rtnl_dereference(dev->rx_handler_data); return netif_is_team_port(dev) ? port : NULL; } /* * Since the ability to change device address for open port device is tested in * team_port_add, this function can be called without control of return value */ static int __set_port_dev_addr(struct net_device *port_dev, const unsigned char *dev_addr) { struct sockaddr_storage addr; memcpy(addr.__data, dev_addr, port_dev->addr_len); addr.ss_family = port_dev->type; return dev_set_mac_address(port_dev, (struct sockaddr *)&addr, NULL); } static int team_port_set_orig_dev_addr(struct team_port *port) { return __set_port_dev_addr(port->dev, port->orig.dev_addr); } static int team_port_set_team_dev_addr(struct team *team, struct team_port *port) { return __set_port_dev_addr(port->dev, team->dev->dev_addr); } int team_modeop_port_enter(struct team *team, struct team_port *port) { return team_port_set_team_dev_addr(team, port); } EXPORT_SYMBOL(team_modeop_port_enter); void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port) { team_port_set_team_dev_addr(team, port); } EXPORT_SYMBOL(team_modeop_port_change_dev_addr); static void team_lower_state_changed(struct team_port *port) { struct netdev_lag_lower_state_info info; info.link_up = port->linkup; info.tx_enabled = team_port_enabled(port); netdev_lower_state_changed(port->dev, &info); } static void team_refresh_port_linkup(struct team_port *port) { bool new_linkup = port->user.linkup_enabled ? port->user.linkup : port->state.linkup; if (port->linkup != new_linkup) { port->linkup = new_linkup; team_lower_state_changed(port); } } /******************* * Options handling *******************/ struct team_option_inst { /* One for each option instance */ struct list_head list; struct list_head tmp_list; struct team_option *option; struct team_option_inst_info info; bool changed; bool removed; }; static struct team_option *__team_find_option(struct team *team, const char *opt_name) { struct team_option *option; list_for_each_entry(option, &team->option_list, list) { if (strcmp(option->name, opt_name) == 0) return option; } return NULL; } static void __team_option_inst_del(struct team_option_inst *opt_inst) { list_del(&opt_inst->list); kfree(opt_inst); } static void __team_option_inst_del_option(struct team *team, struct team_option *option) { struct team_option_inst *opt_inst, *tmp; list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) { if (opt_inst->option == option) __team_option_inst_del(opt_inst); } } static int __team_option_inst_add(struct team *team, struct team_option *option, struct team_port *port) { struct team_option_inst *opt_inst; unsigned int array_size; unsigned int i; array_size = option->array_size; if (!array_size) array_size = 1; /* No array but still need one instance */ for (i = 0; i < array_size; i++) { opt_inst = kmalloc(sizeof(*opt_inst), GFP_KERNEL); if (!opt_inst) return -ENOMEM; opt_inst->option = option; opt_inst->info.port = port; opt_inst->info.array_index = i; opt_inst->changed = true; opt_inst->removed = false; list_add_tail(&opt_inst->list, &team->option_inst_list); if (option->init) option->init(team, &opt_inst->info); } return 0; } static int __team_option_inst_add_option(struct team *team, struct team_option *option) { int err; if (!option->per_port) { err = __team_option_inst_add(team, option, NULL); if (err) goto inst_del_option; } return 0; inst_del_option: __team_option_inst_del_option(team, option); return err; } static void __team_option_inst_mark_removed_option(struct team *team, struct team_option *option) { struct team_option_inst *opt_inst; list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->option == option) { opt_inst->changed = true; opt_inst->removed = true; } } } static void __team_option_inst_del_port(struct team *team, struct team_port *port) { struct team_option_inst *opt_inst, *tmp; list_for_each_entry_safe(opt_inst, tmp, &team->option_inst_list, list) { if (opt_inst->option->per_port && opt_inst->info.port == port) __team_option_inst_del(opt_inst); } } static int __team_option_inst_add_port(struct team *team, struct team_port *port) { struct team_option *option; int err; list_for_each_entry(option, &team->option_list, list) { if (!option->per_port) continue; err = __team_option_inst_add(team, option, port); if (err) goto inst_del_port; } return 0; inst_del_port: __team_option_inst_del_port(team, port); return err; } static void __team_option_inst_mark_removed_port(struct team *team, struct team_port *port) { struct team_option_inst *opt_inst; list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->info.port == port) { opt_inst->changed = true; opt_inst->removed = true; } } } static int __team_options_register(struct team *team, const struct team_option *option, size_t option_count) { int i; struct team_option **dst_opts; int err; dst_opts = kcalloc(option_count, sizeof(struct team_option *), GFP_KERNEL); if (!dst_opts) return -ENOMEM; for (i = 0; i < option_count; i++, option++) { if (__team_find_option(team, option->name)) { err = -EEXIST; goto alloc_rollback; } dst_opts[i] = kmemdup(option, sizeof(*option), GFP_KERNEL); if (!dst_opts[i]) { err = -ENOMEM; goto alloc_rollback; } } for (i = 0; i < option_count; i++) { err = __team_option_inst_add_option(team, dst_opts[i]); if (err) goto inst_rollback; list_add_tail(&dst_opts[i]->list, &team->option_list); } kfree(dst_opts); return 0; inst_rollback: for (i--; i >= 0; i--) { __team_option_inst_del_option(team, dst_opts[i]); list_del(&dst_opts[i]->list); } i = option_count; alloc_rollback: for (i--; i >= 0; i--) kfree(dst_opts[i]); kfree(dst_opts); return err; } static void __team_options_mark_removed(struct team *team, const struct team_option *option, size_t option_count) { int i; for (i = 0; i < option_count; i++, option++) { struct team_option *del_opt; del_opt = __team_find_option(team, option->name); if (del_opt) __team_option_inst_mark_removed_option(team, del_opt); } } static void __team_options_unregister(struct team *team, const struct team_option *option, size_t option_count) { int i; for (i = 0; i < option_count; i++, option++) { struct team_option *del_opt; del_opt = __team_find_option(team, option->name); if (del_opt) { __team_option_inst_del_option(team, del_opt); list_del(&del_opt->list); kfree(del_opt); } } } static void __team_options_change_check(struct team *team); int team_options_register(struct team *team, const struct team_option *option, size_t option_count) { int err; err = __team_options_register(team, option, option_count); if (err) return err; __team_options_change_check(team); return 0; } EXPORT_SYMBOL(team_options_register); void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count) { __team_options_mark_removed(team, option, option_count); __team_options_change_check(team); __team_options_unregister(team, option, option_count); } EXPORT_SYMBOL(team_options_unregister); static int team_option_get(struct team *team, struct team_option_inst *opt_inst, struct team_gsetter_ctx *ctx) { if (!opt_inst->option->getter) return -EOPNOTSUPP; opt_inst->option->getter(team, ctx); return 0; } static int team_option_set(struct team *team, struct team_option_inst *opt_inst, struct team_gsetter_ctx *ctx) { if (!opt_inst->option->setter) return -EOPNOTSUPP; return opt_inst->option->setter(team, ctx); } void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info) { struct team_option_inst *opt_inst; opt_inst = container_of(opt_inst_info, struct team_option_inst, info); opt_inst->changed = true; } EXPORT_SYMBOL(team_option_inst_set_change); void team_options_change_check(struct team *team) { __team_options_change_check(team); } EXPORT_SYMBOL(team_options_change_check); /**************** * Mode handling ****************/ static LIST_HEAD(mode_list); static DEFINE_SPINLOCK(mode_list_lock); struct team_mode_item { struct list_head list; const struct team_mode *mode; }; static struct team_mode_item *__find_mode(const char *kind) { struct team_mode_item *mitem; list_for_each_entry(mitem, &mode_list, list) { if (strcmp(mitem->mode->kind, kind) == 0) return mitem; } return NULL; } static bool is_good_mode_name(const char *name) { while (*name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_') return false; name++; } return true; } int team_mode_register(const struct team_mode *mode) { int err = 0; struct team_mode_item *mitem; if (!is_good_mode_name(mode->kind) || mode->priv_size > TEAM_MODE_PRIV_SIZE) return -EINVAL; mitem = kmalloc(sizeof(*mitem), GFP_KERNEL); if (!mitem) return -ENOMEM; spin_lock(&mode_list_lock); if (__find_mode(mode->kind)) { err = -EEXIST; kfree(mitem); goto unlock; } mitem->mode = mode; list_add_tail(&mitem->list, &mode_list); unlock: spin_unlock(&mode_list_lock); return err; } EXPORT_SYMBOL(team_mode_register); void team_mode_unregister(const struct team_mode *mode) { struct team_mode_item *mitem; spin_lock(&mode_list_lock); mitem = __find_mode(mode->kind); if (mitem) { list_del_init(&mitem->list); kfree(mitem); } spin_unlock(&mode_list_lock); } EXPORT_SYMBOL(team_mode_unregister); static const struct team_mode *team_mode_get(const char *kind) { struct team_mode_item *mitem; const struct team_mode *mode = NULL; if (!try_module_get(THIS_MODULE)) return NULL; spin_lock(&mode_list_lock); mitem = __find_mode(kind); if (!mitem) { spin_unlock(&mode_list_lock); request_module("team-mode-%s", kind); spin_lock(&mode_list_lock); mitem = __find_mode(kind); } if (mitem) { mode = mitem->mode; if (!try_module_get(mode->owner)) mode = NULL; } spin_unlock(&mode_list_lock); module_put(THIS_MODULE); return mode; } static void team_mode_put(const struct team_mode *mode) { module_put(mode->owner); } static bool team_dummy_transmit(struct team *team, struct sk_buff *skb) { dev_kfree_skb_any(skb); return false; } static rx_handler_result_t team_dummy_receive(struct team *team, struct team_port *port, struct sk_buff *skb) { return RX_HANDLER_ANOTHER; } static const struct team_mode __team_no_mode = { .kind = "*NOMODE*", }; static bool team_is_mode_set(struct team *team) { return team->mode != &__team_no_mode; } static void team_set_no_mode(struct team *team) { team->user_carrier_enabled = false; team->mode = &__team_no_mode; } static void team_adjust_ops(struct team *team) { /* * To avoid checks in rx/tx skb paths, ensure here that non-null and * correct ops are always set. */ if (!team->en_port_count || !team_is_mode_set(team) || !team->mode->ops->transmit) team->ops.transmit = team_dummy_transmit; else team->ops.transmit = team->mode->ops->transmit; if (!team->en_port_count || !team_is_mode_set(team) || !team->mode->ops->receive) team->ops.receive = team_dummy_receive; else team->ops.receive = team->mode->ops->receive; } /* * We can benefit from the fact that it's ensured no port is present * at the time of mode change. Therefore no packets are in fly so there's no * need to set mode operations in any special way. */ static int __team_change_mode(struct team *team, const struct team_mode *new_mode) { /* Check if mode was previously set and do cleanup if so */ if (team_is_mode_set(team)) { void (*exit_op)(struct team *team) = team->ops.exit; /* Clear ops area so no callback is called any longer */ memset(&team->ops, 0, sizeof(struct team_mode_ops)); team_adjust_ops(team); if (exit_op) exit_op(team); team_mode_put(team->mode); team_set_no_mode(team); /* zero private data area */ memset(&team->mode_priv, 0, sizeof(struct team) - offsetof(struct team, mode_priv)); } if (!new_mode) return 0; if (new_mode->ops->init) { int err; err = new_mode->ops->init(team); if (err) return err; } team->mode = new_mode; memcpy(&team->ops, new_mode->ops, sizeof(struct team_mode_ops)); team_adjust_ops(team); return 0; } static int team_change_mode(struct team *team, const char *kind) { const struct team_mode *new_mode; struct net_device *dev = team->dev; int err; if (!list_empty(&team->port_list)) { netdev_err(dev, "No ports can be present during mode change\n"); return -EBUSY; } if (team_is_mode_set(team) && strcmp(team->mode->kind, kind) == 0) { netdev_err(dev, "Unable to change to the same mode the team is in\n"); return -EINVAL; } new_mode = team_mode_get(kind); if (!new_mode) { netdev_err(dev, "Mode \"%s\" not found\n", kind); return -EINVAL; } err = __team_change_mode(team, new_mode); if (err) { netdev_err(dev, "Failed to change to mode \"%s\"\n", kind); team_mode_put(new_mode); return err; } netdev_info(dev, "Mode changed to \"%s\"\n", kind); return 0; } /********************* * Peers notification *********************/ static void team_notify_peers_work(struct work_struct *work) { struct team *team; int val; team = container_of(work, struct team, notify_peers.dw.work); if (!rtnl_trylock()) { schedule_delayed_work(&team->notify_peers.dw, 0); return; } val = atomic_dec_if_positive(&team->notify_peers.count_pending); if (val < 0) { rtnl_unlock(); return; } call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, team->dev); rtnl_unlock(); if (val) schedule_delayed_work(&team->notify_peers.dw, msecs_to_jiffies(team->notify_peers.interval)); } static void team_notify_peers(struct team *team) { if (!team->notify_peers.count || !netif_running(team->dev)) return; atomic_add(team->notify_peers.count, &team->notify_peers.count_pending); schedule_delayed_work(&team->notify_peers.dw, 0); } static void team_notify_peers_init(struct team *team) { INIT_DELAYED_WORK(&team->notify_peers.dw, team_notify_peers_work); } static void team_notify_peers_fini(struct team *team) { cancel_delayed_work_sync(&team->notify_peers.dw); } /******************************* * Send multicast group rejoins *******************************/ static void team_mcast_rejoin_work(struct work_struct *work) { struct team *team; int val; team = container_of(work, struct team, mcast_rejoin.dw.work); if (!rtnl_trylock()) { schedule_delayed_work(&team->mcast_rejoin.dw, 0); return; } val = atomic_dec_if_positive(&team->mcast_rejoin.count_pending); if (val < 0) { rtnl_unlock(); return; } call_netdevice_notifiers(NETDEV_RESEND_IGMP, team->dev); rtnl_unlock(); if (val) schedule_delayed_work(&team->mcast_rejoin.dw, msecs_to_jiffies(team->mcast_rejoin.interval)); } static void team_mcast_rejoin(struct team *team) { if (!team->mcast_rejoin.count || !netif_running(team->dev)) return; atomic_add(team->mcast_rejoin.count, &team->mcast_rejoin.count_pending); schedule_delayed_work(&team->mcast_rejoin.dw, 0); } static void team_mcast_rejoin_init(struct team *team) { INIT_DELAYED_WORK(&team->mcast_rejoin.dw, team_mcast_rejoin_work); } static void team_mcast_rejoin_fini(struct team *team) { cancel_delayed_work_sync(&team->mcast_rejoin.dw); } /************************ * Rx path frame handler ************************/ /* note: already called with rcu_read_lock */ static rx_handler_result_t team_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct team_port *port; struct team *team; rx_handler_result_t res; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return RX_HANDLER_CONSUMED; *pskb = skb; port = team_port_get_rcu(skb->dev); team = port->team; if (!team_port_enabled(port)) { if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) /* link-local packets are mostly useful when stack receives them * with the link they arrive on. */ return RX_HANDLER_PASS; /* allow exact match delivery for disabled ports */ res = RX_HANDLER_EXACT; } else { res = team->ops.receive(team, port, skb); } if (res == RX_HANDLER_ANOTHER) { struct team_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(team->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->rx_packets); u64_stats_add(&pcpu_stats->rx_bytes, skb->len); if (skb->pkt_type == PACKET_MULTICAST) u64_stats_inc(&pcpu_stats->rx_multicast); u64_stats_update_end(&pcpu_stats->syncp); skb->dev = team->dev; } else if (res == RX_HANDLER_EXACT) { this_cpu_inc(team->pcpu_stats->rx_nohandler); } else { this_cpu_inc(team->pcpu_stats->rx_dropped); } return res; } /************************************* * Multiqueue Tx port select override *************************************/ static int team_queue_override_init(struct team *team) { struct list_head *listarr; unsigned int queue_cnt = team->dev->num_tx_queues - 1; unsigned int i; if (!queue_cnt) return 0; listarr = kmalloc_array(queue_cnt, sizeof(struct list_head), GFP_KERNEL); if (!listarr) return -ENOMEM; team->qom_lists = listarr; for (i = 0; i < queue_cnt; i++) INIT_LIST_HEAD(listarr++); return 0; } static void team_queue_override_fini(struct team *team) { kfree(team->qom_lists); } static struct list_head *__team_get_qom_list(struct team *team, u16 queue_id) { return &team->qom_lists[queue_id - 1]; } /* * note: already called with rcu_read_lock */ static bool team_queue_override_transmit(struct team *team, struct sk_buff *skb) { struct list_head *qom_list; struct team_port *port; if (!team->queue_override_enabled || !skb->queue_mapping) return false; qom_list = __team_get_qom_list(team, skb->queue_mapping); list_for_each_entry_rcu(port, qom_list, qom_list) { if (!team_dev_queue_xmit(team, port, skb)) return true; } return false; } static void __team_queue_override_port_del(struct team *team, struct team_port *port) { if (!port->queue_id) return; list_del_rcu(&port->qom_list); } static bool team_queue_override_port_has_gt_prio_than(struct team_port *port, struct team_port *cur) { if (port->priority < cur->priority) return true; if (port->priority > cur->priority) return false; if (port->index < cur->index) return true; return false; } static void __team_queue_override_port_add(struct team *team, struct team_port *port) { struct team_port *cur; struct list_head *qom_list; struct list_head *node; if (!port->queue_id) return; qom_list = __team_get_qom_list(team, port->queue_id); node = qom_list; list_for_each_entry(cur, qom_list, qom_list) { if (team_queue_override_port_has_gt_prio_than(port, cur)) break; node = &cur->qom_list; } list_add_tail_rcu(&port->qom_list, node); } static void __team_queue_override_enabled_check(struct team *team) { struct team_port *port; bool enabled = false; list_for_each_entry(port, &team->port_list, list) { if (port->queue_id) { enabled = true; break; } } if (enabled == team->queue_override_enabled) return; netdev_dbg(team->dev, "%s queue override\n", enabled ? "Enabling" : "Disabling"); team->queue_override_enabled = enabled; } static void team_queue_override_port_prio_changed(struct team *team, struct team_port *port) { if (!port->queue_id || team_port_enabled(port)) return; __team_queue_override_port_del(team, port); __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_change_queue_id(struct team *team, struct team_port *port, u16 new_queue_id) { if (team_port_enabled(port)) { __team_queue_override_port_del(team, port); port->queue_id = new_queue_id; __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } else { port->queue_id = new_queue_id; } } static void team_queue_override_port_add(struct team *team, struct team_port *port) { __team_queue_override_port_add(team, port); __team_queue_override_enabled_check(team); } static void team_queue_override_port_del(struct team *team, struct team_port *port) { __team_queue_override_port_del(team, port); __team_queue_override_enabled_check(team); } /**************** * Port handling ****************/ static bool team_port_find(const struct team *team, const struct team_port *port) { struct team_port *cur; list_for_each_entry(cur, &team->port_list, list) if (cur == port) return true; return false; } /* * Enable/disable port by adding to enabled port hashlist and setting * port->index (Might be racy so reader could see incorrect ifindex when * processing a flying packet, but that is not a problem). Write guarded * by team->lock. */ static void team_port_enable(struct team *team, struct team_port *port) { if (team_port_enabled(port)) return; port->index = team->en_port_count++; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); team_adjust_ops(team); team_queue_override_port_add(team, port); if (team->ops.port_enabled) team->ops.port_enabled(team, port); team_notify_peers(team); team_mcast_rejoin(team); team_lower_state_changed(port); } static void __reconstruct_port_hlist(struct team *team, int rm_index) { int i; struct team_port *port; for (i = rm_index + 1; i < team->en_port_count; i++) { port = team_get_port_by_index(team, i); hlist_del_rcu(&port->hlist); port->index--; hlist_add_head_rcu(&port->hlist, team_port_index_hash(team, port->index)); } } static void team_port_disable(struct team *team, struct team_port *port) { if (!team_port_enabled(port)) return; if (team->ops.port_disabled) team->ops.port_disabled(team, port); hlist_del_rcu(&port->hlist); __reconstruct_port_hlist(team, port->index); port->index = -1; team->en_port_count--; team_queue_override_port_del(team, port); team_adjust_ops(team); team_lower_state_changed(port); } #define TEAM_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ NETIF_F_HIGHDMA | NETIF_F_LRO | \ NETIF_F_GSO_ENCAP_ALL) #define TEAM_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE) static void __team_compute_features(struct team *team) { struct team_port *port; netdev_features_t vlan_features = TEAM_VLAN_FEATURES; netdev_features_t enc_features = TEAM_ENC_FEATURES; unsigned short max_hard_header_len = ETH_HLEN; unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; rcu_read_lock(); if (list_empty(&team->port_list)) goto done; vlan_features = netdev_base_features(vlan_features); enc_features = netdev_base_features(enc_features); list_for_each_entry_rcu(port, &team->port_list, list) { vlan_features = netdev_increment_features(vlan_features, port->dev->vlan_features, TEAM_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, port->dev->hw_enc_features, TEAM_ENC_FEATURES); dst_release_flag &= port->dev->priv_flags; if (port->dev->hard_header_len > max_hard_header_len) max_hard_header_len = port->dev->hard_header_len; } done: rcu_read_unlock(); team->dev->vlan_features = vlan_features; team->dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; team->dev->hard_header_len = max_hard_header_len; team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) team->dev->priv_flags |= IFF_XMIT_DST_RELEASE; } static void team_compute_features(struct team *team) { __team_compute_features(team); netdev_change_features(team->dev); } static int team_port_enter(struct team *team, struct team_port *port) { int err = 0; dev_hold(team->dev); if (team->ops.port_enter) { err = team->ops.port_enter(team, port); if (err) { netdev_err(team->dev, "Device %s failed to enter team mode\n", port->dev->name); goto err_port_enter; } } return 0; err_port_enter: dev_put(team->dev); return err; } static void team_port_leave(struct team *team, struct team_port *port) { if (team->ops.port_leave) team->ops.port_leave(team, port); dev_put(team->dev); } #ifdef CONFIG_NET_POLL_CONTROLLER static int __team_port_enable_netpoll(struct team_port *port) { struct netpoll *np; int err; np = kzalloc(sizeof(*np), GFP_KERNEL); if (!np) return -ENOMEM; err = __netpoll_setup(np, port->dev); if (err) { kfree(np); return err; } port->np = np; return err; } static int team_port_enable_netpoll(struct team_port *port) { if (!port->team->dev->npinfo) return 0; return __team_port_enable_netpoll(port); } static void team_port_disable_netpoll(struct team_port *port) { struct netpoll *np = port->np; if (!np) return; port->np = NULL; __netpoll_free(np); } #else static int team_port_enable_netpoll(struct team_port *port) { return 0; } static void team_port_disable_netpoll(struct team_port *port) { } #endif static int team_upper_dev_link(struct team *team, struct team_port *port, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; int err; lag_upper_info.tx_type = team->mode->lag_tx_type; lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; err = netdev_master_upper_dev_link(port->dev, team->dev, NULL, &lag_upper_info, extack); if (err) return err; port->dev->priv_flags |= IFF_TEAM_PORT; return 0; } static void team_upper_dev_unlink(struct team *team, struct team_port *port) { netdev_upper_dev_unlink(port->dev, team->dev); port->dev->priv_flags &= ~IFF_TEAM_PORT; } static void __team_port_change_port_added(struct team_port *port, bool linkup); static int team_dev_type_check_change(struct net_device *dev, struct net_device *port_dev); static int team_port_add(struct team *team, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct net_device *dev = team->dev; struct team_port *port; char *portname = port_dev->name; int err; if (port_dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Loopback device can't be added as a team port"); netdev_err(dev, "Device %s is loopback device. Loopback devices can't be added as a team port\n", portname); return -EINVAL; } if (netif_is_team_port(port_dev)) { NL_SET_ERR_MSG(extack, "Device is already a port of a team device"); netdev_err(dev, "Device %s is already a port " "of a team device\n", portname); return -EBUSY; } if (dev == port_dev) { NL_SET_ERR_MSG(extack, "Cannot enslave team device to itself"); netdev_err(dev, "Cannot enslave team device to itself\n"); return -EINVAL; } if (netdev_has_upper_dev(dev, port_dev)) { NL_SET_ERR_MSG(extack, "Device is already an upper device of the team interface"); netdev_err(dev, "Device %s is already an upper device of the team interface\n", portname); return -EBUSY; } if (netdev_has_upper_dev(port_dev, dev)) { NL_SET_ERR_MSG(extack, "Device is already a lower device of the team interface"); netdev_err(dev, "Device %s is already a lower device of the team interface\n", portname); return -EBUSY; } if (port_dev->features & NETIF_F_VLAN_CHALLENGED && vlan_uses_dev(dev)) { NL_SET_ERR_MSG(extack, "Device is VLAN challenged and team device has VLAN set up"); netdev_err(dev, "Device %s is VLAN challenged and team device has VLAN set up\n", portname); return -EPERM; } err = team_dev_type_check_change(dev, port_dev); if (err) return err; if (port_dev->flags & IFF_UP) { NL_SET_ERR_MSG(extack, "Device is up. Set it down before adding it as a team port"); netdev_err(dev, "Device %s is up. Set it down before adding it as a team port\n", portname); return -EBUSY; } port = kzalloc(sizeof(struct team_port) + team->mode->port_priv_size, GFP_KERNEL); if (!port) return -ENOMEM; port->dev = port_dev; port->team = team; INIT_LIST_HEAD(&port->qom_list); port->orig.mtu = port_dev->mtu; err = dev_set_mtu(port_dev, dev->mtu); if (err) { netdev_dbg(dev, "Error %d calling dev_set_mtu\n", err); goto err_set_mtu; } memcpy(port->orig.dev_addr, port_dev->dev_addr, port_dev->addr_len); err = team_port_enter(team, port); if (err) { netdev_err(dev, "Device %s failed to enter team mode\n", portname); goto err_port_enter; } err = dev_open(port_dev, extack); if (err) { netdev_dbg(dev, "Device %s opening failed\n", portname); goto err_dev_open; } err = vlan_vids_add_by_dev(port_dev, dev); if (err) { netdev_err(dev, "Failed to add vlan ids to device %s\n", portname); goto err_vids_add; } err = team_port_enable_netpoll(port); if (err) { netdev_err(dev, "Failed to enable netpoll on device %s\n", portname); goto err_enable_netpoll; } if (!(dev->features & NETIF_F_LRO)) dev_disable_lro(port_dev); err = netdev_rx_handler_register(port_dev, team_handle_frame, port); if (err) { netdev_err(dev, "Device %s failed to register rx_handler\n", portname); goto err_handler_register; } err = team_upper_dev_link(team, port, extack); if (err) { netdev_err(dev, "Device %s failed to set upper link\n", portname); goto err_set_upper_link; } err = __team_option_inst_add_port(team, port); if (err) { netdev_err(dev, "Device %s failed to add per-port options\n", portname); goto err_option_port_add; } /* set promiscuity level to new slave */ if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(port_dev, 1); if (err) goto err_set_slave_promisc; } /* set allmulti level to new slave */ if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(port_dev, 1); if (err) { if (dev->flags & IFF_PROMISC) dev_set_promiscuity(port_dev, -1); goto err_set_slave_promisc; } } if (dev->flags & IFF_UP) { netif_addr_lock_bh(dev); dev_uc_sync_multiple(port_dev, dev); dev_mc_sync_multiple(port_dev, dev); netif_addr_unlock_bh(dev); } port->index = -1; list_add_tail_rcu(&port->list, &team->port_list); team_port_enable(team, port); __team_compute_features(team); __team_port_change_port_added(port, !!netif_oper_up(port_dev)); __team_options_change_check(team); netdev_info(dev, "Port device %s added\n", portname); return 0; err_set_slave_promisc: __team_option_inst_del_port(team, port); err_option_port_add: team_upper_dev_unlink(team, port); err_set_upper_link: netdev_rx_handler_unregister(port_dev); err_handler_register: team_port_disable_netpoll(port); err_enable_netpoll: vlan_vids_del_by_dev(port_dev, dev); err_vids_add: dev_close(port_dev); err_dev_open: team_port_leave(team, port); team_port_set_orig_dev_addr(port); err_port_enter: dev_set_mtu(port_dev, port->orig.mtu); err_set_mtu: kfree(port); return err; } static void __team_port_change_port_removed(struct team_port *port); static int team_port_del(struct team *team, struct net_device *port_dev) { struct net_device *dev = team->dev; struct team_port *port; char *portname = port_dev->name; port = team_port_get_rtnl(port_dev); if (!port || !team_port_find(team, port)) { netdev_err(dev, "Device %s does not act as a port of this team\n", portname); return -ENOENT; } team_port_disable(team, port); list_del_rcu(&port->list); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(port_dev, -1); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(port_dev, -1); team_upper_dev_unlink(team, port); netdev_rx_handler_unregister(port_dev); team_port_disable_netpoll(port); vlan_vids_del_by_dev(port_dev, dev); if (dev->flags & IFF_UP) { dev_uc_unsync(port_dev, dev); dev_mc_unsync(port_dev, dev); } dev_close(port_dev); team_port_leave(team, port); __team_option_inst_mark_removed_port(team, port); __team_options_change_check(team); __team_option_inst_del_port(team, port); __team_port_change_port_removed(port); team_port_set_orig_dev_addr(port); dev_set_mtu(port_dev, port->orig.mtu); kfree_rcu(port, rcu); netdev_info(dev, "Port device %s removed\n", portname); __team_compute_features(team); return 0; } /***************** * Net device ops *****************/ static void team_mode_option_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.str_val = team->mode->kind; } static int team_mode_option_set(struct team *team, struct team_gsetter_ctx *ctx) { return team_change_mode(team, ctx->data.str_val); } static void team_notify_peers_count_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->notify_peers.count; } static int team_notify_peers_count_set(struct team *team, struct team_gsetter_ctx *ctx) { team->notify_peers.count = ctx->data.u32_val; return 0; } static void team_notify_peers_interval_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->notify_peers.interval; } static int team_notify_peers_interval_set(struct team *team, struct team_gsetter_ctx *ctx) { team->notify_peers.interval = ctx->data.u32_val; return 0; } static void team_mcast_rejoin_count_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->mcast_rejoin.count; } static int team_mcast_rejoin_count_set(struct team *team, struct team_gsetter_ctx *ctx) { team->mcast_rejoin.count = ctx->data.u32_val; return 0; } static void team_mcast_rejoin_interval_get(struct team *team, struct team_gsetter_ctx *ctx) { ctx->data.u32_val = team->mcast_rejoin.interval; } static int team_mcast_rejoin_interval_set(struct team *team, struct team_gsetter_ctx *ctx) { team->mcast_rejoin.interval = ctx->data.u32_val; return 0; } static void team_port_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = team_port_enabled(port); } static int team_port_en_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; if (ctx->data.bool_val) team_port_enable(team, port); else team_port_disable(team, port); return 0; } static void team_user_linkup_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = port->user.linkup; } static void __team_carrier_check(struct team *team); static int team_user_linkup_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; port->user.linkup = ctx->data.bool_val; team_refresh_port_linkup(port); __team_carrier_check(port->team); return 0; } static void team_user_linkup_en_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.bool_val = port->user.linkup_enabled; } static int team_user_linkup_en_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; port->user.linkup_enabled = ctx->data.bool_val; team_refresh_port_linkup(port); __team_carrier_check(port->team); return 0; } static void team_priority_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.s32_val = port->priority; } static int team_priority_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; s32 priority = ctx->data.s32_val; if (port->priority == priority) return 0; port->priority = priority; team_queue_override_port_prio_changed(team, port); return 0; } static void team_queue_id_option_get(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; ctx->data.u32_val = port->queue_id; } static int team_queue_id_option_set(struct team *team, struct team_gsetter_ctx *ctx) { struct team_port *port = ctx->info->port; u16 new_queue_id = ctx->data.u32_val; if (port->queue_id == new_queue_id) return 0; if (new_queue_id >= team->dev->real_num_tx_queues) return -EINVAL; team_queue_override_port_change_queue_id(team, port, new_queue_id); return 0; } static const struct team_option team_options[] = { { .name = "mode", .type = TEAM_OPTION_TYPE_STRING, .getter = team_mode_option_get, .setter = team_mode_option_set, }, { .name = "notify_peers_count", .type = TEAM_OPTION_TYPE_U32, .getter = team_notify_peers_count_get, .setter = team_notify_peers_count_set, }, { .name = "notify_peers_interval", .type = TEAM_OPTION_TYPE_U32, .getter = team_notify_peers_interval_get, .setter = team_notify_peers_interval_set, }, { .name = "mcast_rejoin_count", .type = TEAM_OPTION_TYPE_U32, .getter = team_mcast_rejoin_count_get, .setter = team_mcast_rejoin_count_set, }, { .name = "mcast_rejoin_interval", .type = TEAM_OPTION_TYPE_U32, .getter = team_mcast_rejoin_interval_get, .setter = team_mcast_rejoin_interval_set, }, { .name = "enabled", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_port_en_option_get, .setter = team_port_en_option_set, }, { .name = "user_linkup", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_user_linkup_option_get, .setter = team_user_linkup_option_set, }, { .name = "user_linkup_enabled", .type = TEAM_OPTION_TYPE_BOOL, .per_port = true, .getter = team_user_linkup_en_option_get, .setter = team_user_linkup_en_option_set, }, { .name = "priority", .type = TEAM_OPTION_TYPE_S32, .per_port = true, .getter = team_priority_option_get, .setter = team_priority_option_set, }, { .name = "queue_id", .type = TEAM_OPTION_TYPE_U32, .per_port = true, .getter = team_queue_id_option_get, .setter = team_queue_id_option_set, }, }; static int team_init(struct net_device *dev) { struct team *team = netdev_priv(dev); int i; int err; team->dev = dev; team_set_no_mode(team); team->notifier_ctx = false; team->pcpu_stats = netdev_alloc_pcpu_stats(struct team_pcpu_stats); if (!team->pcpu_stats) return -ENOMEM; for (i = 0; i < TEAM_PORT_HASHENTRIES; i++) INIT_HLIST_HEAD(&team->en_port_hlist[i]); INIT_LIST_HEAD(&team->port_list); err = team_queue_override_init(team); if (err) goto err_team_queue_override_init; team_adjust_ops(team); INIT_LIST_HEAD(&team->option_list); INIT_LIST_HEAD(&team->option_inst_list); team_notify_peers_init(team); team_mcast_rejoin_init(team); err = team_options_register(team, team_options, ARRAY_SIZE(team_options)); if (err) goto err_options_register; netif_carrier_off(dev); lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); netdev_lockdep_set_classes(dev); return 0; err_options_register: team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); err_team_queue_override_init: free_percpu(team->pcpu_stats); return err; } static void team_uninit(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; struct team_port *tmp; mutex_lock(&team->lock); list_for_each_entry_safe(port, tmp, &team->port_list, list) team_port_del(team, port->dev); __team_change_mode(team, NULL); /* cleanup */ __team_options_unregister(team, team_options, ARRAY_SIZE(team_options)); team_mcast_rejoin_fini(team); team_notify_peers_fini(team); team_queue_override_fini(team); mutex_unlock(&team->lock); netdev_change_features(dev); lockdep_unregister_key(&team->team_lock_key); } static void team_destructor(struct net_device *dev) { struct team *team = netdev_priv(dev); free_percpu(team->pcpu_stats); } static int team_open(struct net_device *dev) { return 0; } static int team_close(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; list_for_each_entry(port, &team->port_list, list) { dev_uc_unsync(port->dev, dev); dev_mc_unsync(port->dev, dev); } return 0; } /* * note: already called with rcu_read_lock */ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) { struct team *team = netdev_priv(dev); bool tx_success; unsigned int len = skb->len; tx_success = team_queue_override_transmit(team, skb); if (!tx_success) tx_success = team->ops.transmit(team, skb); if (tx_success) { struct team_pcpu_stats *pcpu_stats; pcpu_stats = this_cpu_ptr(team->pcpu_stats); u64_stats_update_begin(&pcpu_stats->syncp); u64_stats_inc(&pcpu_stats->tx_packets); u64_stats_add(&pcpu_stats->tx_bytes, len); u64_stats_update_end(&pcpu_stats->syncp); } else { this_cpu_inc(team->pcpu_stats->tx_dropped); } return NETDEV_TX_OK; } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* * This helper function exists to help dev_pick_tx get the correct * destination queue. Using a helper function skips a call to * skb_tx_hash and will put the skbs in the queue we expect on their * way down to the team driver. */ u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; /* * Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping; if (unlikely(txq >= dev->real_num_tx_queues)) { do { txq -= dev->real_num_tx_queues; } while (txq >= dev->real_num_tx_queues); } return txq; } static void team_change_rx_flags(struct net_device *dev, int change) { struct team *team = netdev_priv(dev); struct team_port *port; int inc; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { if (change & IFF_PROMISC) { inc = dev->flags & IFF_PROMISC ? 1 : -1; dev_set_promiscuity(port->dev, inc); } if (change & IFF_ALLMULTI) { inc = dev->flags & IFF_ALLMULTI ? 1 : -1; dev_set_allmulti(port->dev, inc); } } rcu_read_unlock(); } static void team_set_rx_mode(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { dev_uc_sync_multiple(port->dev, dev); dev_mc_sync_multiple(port->dev, dev); } rcu_read_unlock(); } static int team_set_mac_address(struct net_device *dev, void *p) { struct sockaddr *addr = p; struct team *team = netdev_priv(dev); struct team_port *port; if (dev->type == ARPHRD_ETHER && !is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; dev_addr_set(dev, addr->sa_data); mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) if (team->ops.port_change_dev_addr) team->ops.port_change_dev_addr(team, port); mutex_unlock(&team->lock); return 0; } static int team_change_mtu(struct net_device *dev, int new_mtu) { struct team *team = netdev_priv(dev); struct team_port *port; int err; /* * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ mutex_lock(&team->lock); team->port_mtu_change_allowed = true; list_for_each_entry(port, &team->port_list, list) { err = dev_set_mtu(port->dev, new_mtu); if (err) { netdev_err(dev, "Device %s failed to change mtu", port->dev->name); goto unwind; } } team->port_mtu_change_allowed = false; mutex_unlock(&team->lock); WRITE_ONCE(dev->mtu, new_mtu); return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) dev_set_mtu(port->dev, dev->mtu); team->port_mtu_change_allowed = false; mutex_unlock(&team->lock); return err; } static void team_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct team *team = netdev_priv(dev); struct team_pcpu_stats *p; u64 rx_packets, rx_bytes, rx_multicast, tx_packets, tx_bytes; u32 rx_dropped = 0, tx_dropped = 0, rx_nohandler = 0; unsigned int start; int i; for_each_possible_cpu(i) { p = per_cpu_ptr(team->pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rx_packets = u64_stats_read(&p->rx_packets); rx_bytes = u64_stats_read(&p->rx_bytes); rx_multicast = u64_stats_read(&p->rx_multicast); tx_packets = u64_stats_read(&p->tx_packets); tx_bytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rx_packets; stats->rx_bytes += rx_bytes; stats->multicast += rx_multicast; stats->tx_packets += tx_packets; stats->tx_bytes += tx_bytes; /* * rx_dropped, tx_dropped & rx_nohandler are u32, * updated without syncp protection. */ rx_dropped += READ_ONCE(p->rx_dropped); tx_dropped += READ_ONCE(p->tx_dropped); rx_nohandler += READ_ONCE(p->rx_nohandler); } stats->rx_dropped = rx_dropped; stats->tx_dropped = tx_dropped; stats->rx_nohandler = rx_nohandler; } static int team_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct team *team = netdev_priv(dev); struct team_port *port; int err; /* * Alhough this is reader, it's guarded by team lock. It's not possible * to traverse list in reverse under rcu_read_lock */ mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = vlan_vid_add(port->dev, proto, vid); if (err) goto unwind; } mutex_unlock(&team->lock); return 0; unwind: list_for_each_entry_continue_reverse(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); mutex_unlock(&team->lock); return err; } static int team_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct team *team = netdev_priv(dev); struct team_port *port; mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) vlan_vid_del(port->dev, proto, vid); mutex_unlock(&team->lock); return 0; } #ifdef CONFIG_NET_POLL_CONTROLLER static void team_poll_controller(struct net_device *dev) { } static void __team_netpoll_cleanup(struct team *team) { struct team_port *port; list_for_each_entry(port, &team->port_list, list) team_port_disable_netpoll(port); } static void team_netpoll_cleanup(struct net_device *dev) { struct team *team = netdev_priv(dev); mutex_lock(&team->lock); __team_netpoll_cleanup(team); mutex_unlock(&team->lock); } static int team_netpoll_setup(struct net_device *dev) { struct team *team = netdev_priv(dev); struct team_port *port; int err = 0; mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { err = __team_port_enable_netpoll(port); if (err) { __team_netpoll_cleanup(team); break; } } mutex_unlock(&team->lock); return err; } #endif static int team_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { struct team *team = netdev_priv(dev); int err; mutex_lock(&team->lock); err = team_port_add(team, port_dev, extack); mutex_unlock(&team->lock); if (!err) netdev_change_features(dev); return err; } static int team_del_slave(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); int err; mutex_lock(&team->lock); err = team_port_del(team, port_dev); mutex_unlock(&team->lock); if (err) return err; if (netif_is_team_master(port_dev)) { lockdep_unregister_key(&team->team_lock_key); lockdep_register_key(&team->team_lock_key); lockdep_set_class(&team->lock, &team->team_lock_key); } netdev_change_features(dev); return err; } static netdev_features_t team_fix_features(struct net_device *dev, netdev_features_t features) { struct team_port *port; struct team *team = netdev_priv(dev); netdev_features_t mask; mask = features; features = netdev_base_features(features); rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { features = netdev_increment_features(features, port->dev->features, mask); } rcu_read_unlock(); features = netdev_add_tso_features(features, mask); return features; } static int team_change_carrier(struct net_device *dev, bool new_carrier) { struct team *team = netdev_priv(dev); team->user_carrier_enabled = true; if (new_carrier) netif_carrier_on(dev); else netif_carrier_off(dev); return 0; } static const struct net_device_ops team_netdev_ops = { .ndo_init = team_init, .ndo_uninit = team_uninit, .ndo_open = team_open, .ndo_stop = team_close, .ndo_start_xmit = team_xmit, .ndo_select_queue = team_select_queue, .ndo_change_rx_flags = team_change_rx_flags, .ndo_set_rx_mode = team_set_rx_mode, .ndo_set_mac_address = team_set_mac_address, .ndo_change_mtu = team_change_mtu, .ndo_get_stats64 = team_get_stats64, .ndo_vlan_rx_add_vid = team_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = team_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = team_poll_controller, .ndo_netpoll_setup = team_netpoll_setup, .ndo_netpoll_cleanup = team_netpoll_cleanup, #endif .ndo_add_slave = team_add_slave, .ndo_del_slave = team_del_slave, .ndo_fix_features = team_fix_features, .ndo_change_carrier = team_change_carrier, .ndo_features_check = passthru_features_check, }; /*********************** * ethtool interface ***********************/ static void team_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); } static int team_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct team *team= netdev_priv(dev); unsigned long speed = 0; struct team_port *port; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; rcu_read_lock(); list_for_each_entry_rcu(port, &team->port_list, list) { if (team_port_txable(port)) { if (port->state.speed != SPEED_UNKNOWN) speed += port->state.speed; if (cmd->base.duplex == DUPLEX_UNKNOWN && port->state.duplex != DUPLEX_UNKNOWN) cmd->base.duplex = port->state.duplex; } } rcu_read_unlock(); cmd->base.speed = speed ? : SPEED_UNKNOWN; return 0; } static const struct ethtool_ops team_ethtool_ops = { .get_drvinfo = team_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = team_ethtool_get_link_ksettings, }; /*********************** * rt netlink interface ***********************/ static void team_setup_by_port(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); if (port_dev->type == ARPHRD_ETHER) dev->header_ops = team->header_ops_cache; else dev->header_ops = port_dev->header_ops; dev->type = port_dev->type; dev->hard_header_len = port_dev->hard_header_len; dev->needed_headroom = port_dev->needed_headroom; dev->addr_len = port_dev->addr_len; dev->mtu = port_dev->mtu; memcpy(dev->broadcast, port_dev->broadcast, port_dev->addr_len); eth_hw_addr_inherit(dev, port_dev); if (port_dev->flags & IFF_POINTOPOINT) { dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } else if ((port_dev->flags & (IFF_BROADCAST | IFF_MULTICAST)) == (IFF_BROADCAST | IFF_MULTICAST)) { dev->flags |= (IFF_BROADCAST | IFF_MULTICAST); dev->flags &= ~(IFF_POINTOPOINT | IFF_NOARP); } } static int team_dev_type_check_change(struct net_device *dev, struct net_device *port_dev) { struct team *team = netdev_priv(dev); char *portname = port_dev->name; int err; if (dev->type == port_dev->type) return 0; if (!list_empty(&team->port_list)) { netdev_err(dev, "Device %s is of different type\n", portname); return -EBUSY; } err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev); err = notifier_to_errno(err); if (err) { netdev_err(dev, "Refused to change device type\n"); return err; } dev_uc_flush(dev); dev_mc_flush(dev); team_setup_by_port(dev, port_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, dev); return 0; } static void team_setup(struct net_device *dev) { struct team *team = netdev_priv(dev); ether_setup(dev); dev->max_mtu = ETH_MAX_MTU; team->header_ops_cache = dev->header_ops; dev->netdev_ops = &team_netdev_ops; dev->ethtool_ops = &team_ethtool_ops; dev->needs_free_netdev = true; dev->priv_destructor = team_destructor; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_TEAM; /* * Indicate we support unicast address filtering. That way core won't * bring us to promisc mode in case a unicast addr is added. * Let this up to underlay drivers. */ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; dev->lltx = true; /* Don't allow team devices to change network namespaces. */ dev->netns_immutable = true; dev->features |= NETIF_F_GRO; dev->hw_features = TEAM_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_FILTER; dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; dev->features |= dev->hw_features; dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } static int team_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **tb = params->tb; if (tb[IFLA_ADDRESS] == NULL) eth_hw_addr_random(dev); return register_netdevice(dev); } static int team_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static unsigned int team_get_num_tx_queues(void) { return TEAM_DEFAULT_NUM_TX_QUEUES; } static unsigned int team_get_num_rx_queues(void) { return TEAM_DEFAULT_NUM_RX_QUEUES; } static struct rtnl_link_ops team_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct team), .setup = team_setup, .newlink = team_newlink, .validate = team_validate, .get_num_tx_queues = team_get_num_tx_queues, .get_num_rx_queues = team_get_num_rx_queues, }; /*********************************** * Generic netlink custom interface ***********************************/ static struct genl_family team_nl_family; int team_nl_noop_doit(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *hdr; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &team_nl_family, 0, TEAM_CMD_NOOP); if (!hdr) { err = -EMSGSIZE; goto err_msg_put; } genlmsg_end(msg, hdr); return genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); err_msg_put: nlmsg_free(msg); return err; } /* * Netlink cmd functions should be locked by following two functions. * Since dev gets held here, that ensures dev won't disappear in between. */ static struct team *team_nl_team_get(struct genl_info *info) { struct net *net = genl_info_net(info); int ifindex; struct net_device *dev; struct team *team; if (!info->attrs[TEAM_ATTR_TEAM_IFINDEX]) return NULL; ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]); dev = dev_get_by_index(net, ifindex); if (!dev || dev->netdev_ops != &team_netdev_ops) { dev_put(dev); return NULL; } team = netdev_priv(dev); mutex_lock(&team->lock); return team; } static void team_nl_team_put(struct team *team) { mutex_unlock(&team->lock); dev_put(team->dev); } typedef int team_nl_send_func_t(struct sk_buff *skb, struct team *team, u32 portid); static int team_nl_send_unicast(struct sk_buff *skb, struct team *team, u32 portid) { return genlmsg_unicast(dev_net(team->dev), skb, portid); } static int team_nl_fill_one_option_get(struct sk_buff *skb, struct team *team, struct team_option_inst *opt_inst) { struct nlattr *option_item; struct team_option *option = opt_inst->option; struct team_option_inst_info *opt_inst_info = &opt_inst->info; struct team_gsetter_ctx ctx; int err; ctx.info = opt_inst_info; err = team_option_get(team, opt_inst, &ctx); if (err) return err; option_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_OPTION); if (!option_item) return -EMSGSIZE; if (nla_put_string(skb, TEAM_ATTR_OPTION_NAME, option->name)) goto nest_cancel; if (opt_inst_info->port && nla_put_u32(skb, TEAM_ATTR_OPTION_PORT_IFINDEX, opt_inst_info->port->dev->ifindex)) goto nest_cancel; if (opt_inst->option->array_size && nla_put_u32(skb, TEAM_ATTR_OPTION_ARRAY_INDEX, opt_inst_info->array_index)) goto nest_cancel; switch (option->type) { case TEAM_OPTION_TYPE_U32: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_U32)) goto nest_cancel; if (nla_put_u32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.u32_val)) goto nest_cancel; break; case TEAM_OPTION_TYPE_STRING: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_STRING)) goto nest_cancel; if (nla_put_string(skb, TEAM_ATTR_OPTION_DATA, ctx.data.str_val)) goto nest_cancel; break; case TEAM_OPTION_TYPE_BINARY: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_BINARY)) goto nest_cancel; if (nla_put(skb, TEAM_ATTR_OPTION_DATA, ctx.data.bin_val.len, ctx.data.bin_val.ptr)) goto nest_cancel; break; case TEAM_OPTION_TYPE_BOOL: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_FLAG)) goto nest_cancel; if (ctx.data.bool_val && nla_put_flag(skb, TEAM_ATTR_OPTION_DATA)) goto nest_cancel; break; case TEAM_OPTION_TYPE_S32: if (nla_put_u8(skb, TEAM_ATTR_OPTION_TYPE, NLA_S32)) goto nest_cancel; if (nla_put_s32(skb, TEAM_ATTR_OPTION_DATA, ctx.data.s32_val)) goto nest_cancel; break; default: BUG(); } if (opt_inst->removed && nla_put_flag(skb, TEAM_ATTR_OPTION_REMOVED)) goto nest_cancel; if (opt_inst->changed) { if (nla_put_flag(skb, TEAM_ATTR_OPTION_CHANGED)) goto nest_cancel; opt_inst->changed = false; } nla_nest_end(skb, option_item); return 0; nest_cancel: nla_nest_cancel(skb, option_item); return -EMSGSIZE; } static int __send_and_alloc_skb(struct sk_buff **pskb, struct team *team, u32 portid, team_nl_send_func_t *send_func) { int err; if (*pskb) { err = send_func(*pskb, team, portid); if (err) return err; } *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!*pskb) return -ENOMEM; return 0; } static int team_nl_send_options_get(struct team *team, u32 portid, u32 seq, int flags, team_nl_send_func_t *send_func, struct list_head *sel_opt_inst_list) { struct nlattr *option_list; struct nlmsghdr *nlh; void *hdr; struct team_option_inst *opt_inst; int err; struct sk_buff *skb = NULL; bool incomplete; int i; opt_inst = list_first_entry(sel_opt_inst_list, struct team_option_inst, tmp_list); start_again: err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_OPTIONS_GET); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; option_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_OPTION); if (!option_list) goto nla_put_failure; i = 0; incomplete = false; list_for_each_entry_from(opt_inst, sel_opt_inst_list, tmp_list) { err = team_nl_fill_one_option_get(skb, team, opt_inst); if (err) { if (err == -EMSGSIZE) { if (!i) goto errout; incomplete = true; break; } goto errout; } i++; } nla_nest_end(skb, option_list); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; goto send_done; } return send_func(skb, team, portid); nla_put_failure: err = -EMSGSIZE; errout: nlmsg_free(skb); return err; } int team_nl_options_get_doit(struct sk_buff *skb, struct genl_info *info) { struct team *team; struct team_option_inst *opt_inst; int err; LIST_HEAD(sel_opt_inst_list); team = team_nl_team_get(info); if (!team) return -EINVAL; list_for_each_entry(opt_inst, &team->option_inst_list, list) list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list); err = team_nl_send_options_get(team, info->snd_portid, info->snd_seq, NLM_F_ACK, team_nl_send_unicast, &sel_opt_inst_list); team_nl_team_put(team); return err; } static int team_nl_send_event_options_get(struct team *team, struct list_head *sel_opt_inst_list); int team_nl_options_set_doit(struct sk_buff *skb, struct genl_info *info) { struct team *team; int err = 0; int i; struct nlattr *nl_option; rtnl_lock(); team = team_nl_team_get(info); if (!team) { err = -EINVAL; goto rtnl_unlock; } err = -EINVAL; if (!info->attrs[TEAM_ATTR_LIST_OPTION]) { err = -EINVAL; goto team_put; } nla_for_each_nested(nl_option, info->attrs[TEAM_ATTR_LIST_OPTION], i) { struct nlattr *opt_attrs[TEAM_ATTR_OPTION_MAX + 1]; struct nlattr *attr; struct nlattr *attr_data; LIST_HEAD(opt_inst_list); enum team_option_type opt_type; int opt_port_ifindex = 0; /* != 0 for per-port options */ u32 opt_array_index = 0; bool opt_is_array = false; struct team_option_inst *opt_inst; char *opt_name; bool opt_found = false; if (nla_type(nl_option) != TEAM_ATTR_ITEM_OPTION) { err = -EINVAL; goto team_put; } err = nla_parse_nested_deprecated(opt_attrs, TEAM_ATTR_OPTION_MAX, nl_option, team_attr_option_nl_policy, info->extack); if (err) goto team_put; if (!opt_attrs[TEAM_ATTR_OPTION_NAME] || !opt_attrs[TEAM_ATTR_OPTION_TYPE]) { err = -EINVAL; goto team_put; } switch (nla_get_u8(opt_attrs[TEAM_ATTR_OPTION_TYPE])) { case NLA_U32: opt_type = TEAM_OPTION_TYPE_U32; break; case NLA_STRING: opt_type = TEAM_OPTION_TYPE_STRING; break; case NLA_BINARY: opt_type = TEAM_OPTION_TYPE_BINARY; break; case NLA_FLAG: opt_type = TEAM_OPTION_TYPE_BOOL; break; case NLA_S32: opt_type = TEAM_OPTION_TYPE_S32; break; default: goto team_put; } attr_data = opt_attrs[TEAM_ATTR_OPTION_DATA]; if (opt_type != TEAM_OPTION_TYPE_BOOL && !attr_data) { err = -EINVAL; goto team_put; } opt_name = nla_data(opt_attrs[TEAM_ATTR_OPTION_NAME]); attr = opt_attrs[TEAM_ATTR_OPTION_PORT_IFINDEX]; if (attr) opt_port_ifindex = nla_get_u32(attr); attr = opt_attrs[TEAM_ATTR_OPTION_ARRAY_INDEX]; if (attr) { opt_is_array = true; opt_array_index = nla_get_u32(attr); } list_for_each_entry(opt_inst, &team->option_inst_list, list) { struct team_option *option = opt_inst->option; struct team_gsetter_ctx ctx; struct team_option_inst_info *opt_inst_info; int tmp_ifindex; opt_inst_info = &opt_inst->info; tmp_ifindex = opt_inst_info->port ? opt_inst_info->port->dev->ifindex : 0; if (option->type != opt_type || strcmp(option->name, opt_name) || tmp_ifindex != opt_port_ifindex || (option->array_size && !opt_is_array) || opt_inst_info->array_index != opt_array_index) continue; opt_found = true; ctx.info = opt_inst_info; switch (opt_type) { case TEAM_OPTION_TYPE_U32: ctx.data.u32_val = nla_get_u32(attr_data); break; case TEAM_OPTION_TYPE_STRING: if (nla_len(attr_data) > TEAM_STRING_MAX_LEN || !memchr(nla_data(attr_data), '\0', nla_len(attr_data))) { err = -EINVAL; goto team_put; } ctx.data.str_val = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BINARY: ctx.data.bin_val.len = nla_len(attr_data); ctx.data.bin_val.ptr = nla_data(attr_data); break; case TEAM_OPTION_TYPE_BOOL: ctx.data.bool_val = attr_data ? true : false; break; case TEAM_OPTION_TYPE_S32: ctx.data.s32_val = nla_get_s32(attr_data); break; default: BUG(); } err = team_option_set(team, opt_inst, &ctx); if (err) goto team_put; opt_inst->changed = true; list_add(&opt_inst->tmp_list, &opt_inst_list); } if (!opt_found) { err = -ENOENT; goto team_put; } err = team_nl_send_event_options_get(team, &opt_inst_list); if (err) break; } team_put: team_nl_team_put(team); rtnl_unlock: rtnl_unlock(); return err; } static int team_nl_fill_one_port_get(struct sk_buff *skb, struct team_port *port) { struct nlattr *port_item; port_item = nla_nest_start_noflag(skb, TEAM_ATTR_ITEM_PORT); if (!port_item) goto nest_cancel; if (nla_put_u32(skb, TEAM_ATTR_PORT_IFINDEX, port->dev->ifindex)) goto nest_cancel; if (port->changed) { if (nla_put_flag(skb, TEAM_ATTR_PORT_CHANGED)) goto nest_cancel; port->changed = false; } if ((port->removed && nla_put_flag(skb, TEAM_ATTR_PORT_REMOVED)) || (port->state.linkup && nla_put_flag(skb, TEAM_ATTR_PORT_LINKUP)) || nla_put_u32(skb, TEAM_ATTR_PORT_SPEED, port->state.speed) || nla_put_u8(skb, TEAM_ATTR_PORT_DUPLEX, port->state.duplex)) goto nest_cancel; nla_nest_end(skb, port_item); return 0; nest_cancel: nla_nest_cancel(skb, port_item); return -EMSGSIZE; } static int team_nl_send_port_list_get(struct team *team, u32 portid, u32 seq, int flags, team_nl_send_func_t *send_func, struct team_port *one_port) { struct nlattr *port_list; struct nlmsghdr *nlh; void *hdr; struct team_port *port; int err; struct sk_buff *skb = NULL; bool incomplete; int i; port = list_first_entry_or_null(&team->port_list, struct team_port, list); start_again: err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_PORT_LIST_GET); if (!hdr) { nlmsg_free(skb); return -EMSGSIZE; } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; port_list = nla_nest_start_noflag(skb, TEAM_ATTR_LIST_PORT); if (!port_list) goto nla_put_failure; i = 0; incomplete = false; /* If one port is selected, called wants to send port list containing * only this port. Otherwise go through all listed ports and send all */ if (one_port) { err = team_nl_fill_one_port_get(skb, one_port); if (err) goto errout; } else if (port) { list_for_each_entry_from(port, &team->port_list, list) { err = team_nl_fill_one_port_get(skb, port); if (err) { if (err == -EMSGSIZE) { if (!i) goto errout; incomplete = true; break; } goto errout; } i++; } } nla_nest_end(skb, port_list); genlmsg_end(skb, hdr); if (incomplete) goto start_again; send_done: nlh = nlmsg_put(skb, portid, seq, NLMSG_DONE, 0, flags | NLM_F_MULTI); if (!nlh) { err = __send_and_alloc_skb(&skb, team, portid, send_func); if (err) return err; goto send_done; } return send_func(skb, team, portid); nla_put_failure: err = -EMSGSIZE; errout: nlmsg_free(skb); return err; } int team_nl_port_list_get_doit(struct sk_buff *skb, struct genl_info *info) { struct team *team; int err; team = team_nl_team_get(info); if (!team) return -EINVAL; err = team_nl_send_port_list_get(team, info->snd_portid, info->snd_seq, NLM_F_ACK, team_nl_send_unicast, NULL); team_nl_team_put(team); return err; } static const struct genl_multicast_group team_nl_mcgrps[] = { { .name = TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME, }, }; static struct genl_family team_nl_family __ro_after_init = { .name = TEAM_GENL_NAME, .version = TEAM_GENL_VERSION, .maxattr = ARRAY_SIZE(team_nl_policy) - 1, .policy = team_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = team_nl_ops, .n_small_ops = ARRAY_SIZE(team_nl_ops), .resv_start_op = TEAM_CMD_PORT_LIST_GET + 1, .mcgrps = team_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(team_nl_mcgrps), }; static int team_nl_send_multicast(struct sk_buff *skb, struct team *team, u32 portid) { return genlmsg_multicast_netns(&team_nl_family, dev_net(team->dev), skb, 0, 0, GFP_KERNEL); } static int team_nl_send_event_options_get(struct team *team, struct list_head *sel_opt_inst_list) { return team_nl_send_options_get(team, 0, 0, 0, team_nl_send_multicast, sel_opt_inst_list); } static int team_nl_send_event_port_get(struct team *team, struct team_port *port) { return team_nl_send_port_list_get(team, 0, 0, 0, team_nl_send_multicast, port); } static int __init team_nl_init(void) { return genl_register_family(&team_nl_family); } static void __exit team_nl_fini(void) { genl_unregister_family(&team_nl_family); } /****************** * Change checkers ******************/ static void __team_options_change_check(struct team *team) { int err; struct team_option_inst *opt_inst; LIST_HEAD(sel_opt_inst_list); list_for_each_entry(opt_inst, &team->option_inst_list, list) { if (opt_inst->changed) list_add_tail(&opt_inst->tmp_list, &sel_opt_inst_list); } err = team_nl_send_event_options_get(team, &sel_opt_inst_list); if (err && err != -ESRCH) netdev_warn(team->dev, "Failed to send options change via netlink (err %d)\n", err); } /* rtnl lock is held */ static void __team_port_change_send(struct team_port *port, bool linkup) { int err; port->changed = true; port->state.linkup = linkup; team_refresh_port_linkup(port); if (linkup) { struct ethtool_link_ksettings ecmd; err = __ethtool_get_link_ksettings(port->dev, &ecmd); if (!err) { port->state.speed = ecmd.base.speed; port->state.duplex = ecmd.base.duplex; goto send_event; } } port->state.speed = 0; port->state.duplex = 0; send_event: err = team_nl_send_event_port_get(port->team, port); if (err && err != -ESRCH) netdev_warn(port->team->dev, "Failed to send port change of device %s via netlink (err %d)\n", port->dev->name, err); } static void __team_carrier_check(struct team *team) { struct team_port *port; bool team_linkup; if (team->user_carrier_enabled) return; team_linkup = false; list_for_each_entry(port, &team->port_list, list) { if (port->linkup) { team_linkup = true; break; } } if (team_linkup) netif_carrier_on(team->dev); else netif_carrier_off(team->dev); } static void __team_port_change_check(struct team_port *port, bool linkup) { if (port->state.linkup != linkup) __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_added(struct team_port *port, bool linkup) { __team_port_change_send(port, linkup); __team_carrier_check(port->team); } static void __team_port_change_port_removed(struct team_port *port) { port->removed = true; __team_port_change_send(port, false); __team_carrier_check(port->team); } static void team_port_change_check(struct team_port *port, bool linkup) { struct team *team = port->team; mutex_lock(&team->lock); __team_port_change_check(port, linkup); mutex_unlock(&team->lock); } /************************************ * Net device notifier event handler ************************************/ static int team_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct team_port *port; port = team_port_get_rtnl(dev); if (!port) return NOTIFY_DONE; switch (event) { case NETDEV_UP: if (netif_oper_up(dev)) team_port_change_check(port, true); break; case NETDEV_DOWN: team_port_change_check(port, false); break; case NETDEV_CHANGE: if (netif_running(port->dev)) team_port_change_check(port, !!netif_oper_up(port->dev)); break; case NETDEV_UNREGISTER: team_del_slave(port->team->dev, dev); break; case NETDEV_FEAT_CHANGE: if (!port->team->notifier_ctx) { port->team->notifier_ctx = true; team_compute_features(port->team); port->team->notifier_ctx = false; } break; case NETDEV_PRECHANGEMTU: /* Forbid to change mtu of underlaying device */ if (!port->team->port_mtu_change_allowed) return NOTIFY_BAD; break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid to change type of underlaying device */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, port->team->dev); break; } return NOTIFY_DONE; } static struct notifier_block team_notifier_block __read_mostly = { .notifier_call = team_device_event, }; /*********************** * Module init and exit ***********************/ static int __init team_module_init(void) { int err; register_netdevice_notifier(&team_notifier_block); err = rtnl_link_register(&team_link_ops); if (err) goto err_rtnl_reg; err = team_nl_init(); if (err) goto err_nl_init; return 0; err_nl_init: rtnl_link_unregister(&team_link_ops); err_rtnl_reg: unregister_netdevice_notifier(&team_notifier_block); return err; } static void __exit team_module_exit(void) { team_nl_fini(); rtnl_link_unregister(&team_link_ops); unregister_netdevice_notifier(&team_notifier_block); } module_init(team_module_init); module_exit(team_module_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>"); MODULE_DESCRIPTION("Ethernet team device driver"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); |
5 5 5 3 5 3 2 2 2 2 2 2 1 2 2 1 1 1 2 2 2 2 2 2 2 3 2 3 2 1 2 2 2 1 2 2 1 2 4 4 4 4 14 14 14 14 14 4 4 4 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2020 Linaro Limited * * Author: Daniel Lezcano <daniel.lezcano@linaro.org> * * Generic netlink for thermal management framework */ #include <linux/module.h> #include <linux/notifier.h> #include <linux/kernel.h> #include <net/sock.h> #include <net/genetlink.h> #include <uapi/linux/thermal.h> #include "thermal_core.h" static const struct genl_multicast_group thermal_genl_mcgrps[] = { [THERMAL_GENL_SAMPLING_GROUP] = { .name = THERMAL_GENL_SAMPLING_GROUP_NAME, }, [THERMAL_GENL_EVENT_GROUP] = { .name = THERMAL_GENL_EVENT_GROUP_NAME, }, }; static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] = { /* Thermal zone */ [THERMAL_GENL_ATTR_TZ] = { .type = NLA_NESTED }, [THERMAL_GENL_ATTR_TZ_ID] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_TEMP] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_TRIP] = { .type = NLA_NESTED }, [THERMAL_GENL_ATTR_TZ_TRIP_ID] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_TRIP_TEMP] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_TRIP_TYPE] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_TRIP_HYST] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_MODE] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_CDEV_WEIGHT] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_TZ_NAME] = { .type = NLA_STRING, .len = THERMAL_NAME_LENGTH }, /* Governor(s) */ [THERMAL_GENL_ATTR_TZ_GOV] = { .type = NLA_NESTED }, [THERMAL_GENL_ATTR_TZ_GOV_NAME] = { .type = NLA_STRING, .len = THERMAL_NAME_LENGTH }, /* Cooling devices */ [THERMAL_GENL_ATTR_CDEV] = { .type = NLA_NESTED }, [THERMAL_GENL_ATTR_CDEV_ID] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_CDEV_CUR_STATE] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_CDEV_MAX_STATE] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_CDEV_NAME] = { .type = NLA_STRING, .len = THERMAL_NAME_LENGTH }, /* CPU capabilities */ [THERMAL_GENL_ATTR_CPU_CAPABILITY] = { .type = NLA_NESTED }, [THERMAL_GENL_ATTR_CPU_CAPABILITY_ID] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY] = { .type = NLA_U32 }, /* Thresholds */ [THERMAL_GENL_ATTR_THRESHOLD] = { .type = NLA_NESTED }, [THERMAL_GENL_ATTR_THRESHOLD_TEMP] = { .type = NLA_U32 }, [THERMAL_GENL_ATTR_THRESHOLD_DIRECTION] = { .type = NLA_U32 }, }; struct param { struct nlattr **attrs; struct sk_buff *msg; const char *name; int tz_id; int cdev_id; int trip_id; int trip_temp; int trip_type; int trip_hyst; int temp; int prev_temp; int direction; int cdev_state; int cdev_max_state; struct thermal_genl_cpu_caps *cpu_capabilities; int cpu_capabilities_count; }; typedef int (*cb_t)(struct param *); static struct genl_family thermal_genl_family; static BLOCKING_NOTIFIER_HEAD(thermal_genl_chain); static int thermal_group_has_listeners(enum thermal_genl_multicast_groups group) { return genl_has_listeners(&thermal_genl_family, &init_net, group); } /************************** Sampling encoding *******************************/ int thermal_genl_sampling_temp(int id, int temp) { struct sk_buff *skb; void *hdr; if (!thermal_group_has_listeners(THERMAL_GENL_SAMPLING_GROUP)) return 0; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, 0, 0, &thermal_genl_family, 0, THERMAL_GENL_SAMPLING_TEMP); if (!hdr) goto out_free; if (nla_put_u32(skb, THERMAL_GENL_ATTR_TZ_ID, id)) goto out_cancel; if (nla_put_u32(skb, THERMAL_GENL_ATTR_TZ_TEMP, temp)) goto out_cancel; genlmsg_end(skb, hdr); genlmsg_multicast(&thermal_genl_family, skb, 0, THERMAL_GENL_SAMPLING_GROUP, GFP_KERNEL); return 0; out_cancel: genlmsg_cancel(skb, hdr); out_free: nlmsg_free(skb); return -EMSGSIZE; } /**************************** Event encoding *********************************/ static int thermal_genl_event_tz_create(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || nla_put_string(p->msg, THERMAL_GENL_ATTR_TZ_NAME, p->name)) return -EMSGSIZE; return 0; } static int thermal_genl_event_tz(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id)) return -EMSGSIZE; return 0; } static int thermal_genl_event_tz_trip_up(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TEMP, p->temp)) return -EMSGSIZE; return 0; } static int thermal_genl_event_tz_trip_change(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, p->trip_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, p->trip_type) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, p->trip_temp) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, p->trip_hyst)) return -EMSGSIZE; return 0; } static int thermal_genl_event_cdev_add(struct param *p) { if (nla_put_string(p->msg, THERMAL_GENL_ATTR_CDEV_NAME, p->name) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, p->cdev_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_MAX_STATE, p->cdev_max_state)) return -EMSGSIZE; return 0; } static int thermal_genl_event_cdev_delete(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, p->cdev_id)) return -EMSGSIZE; return 0; } static int thermal_genl_event_cdev_state_update(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_ID, p->cdev_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_CDEV_CUR_STATE, p->cdev_state)) return -EMSGSIZE; return 0; } static int thermal_genl_event_gov_change(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || nla_put_string(p->msg, THERMAL_GENL_ATTR_GOV_NAME, p->name)) return -EMSGSIZE; return 0; } static int thermal_genl_event_cpu_capability_change(struct param *p) { struct thermal_genl_cpu_caps *cpu_cap = p->cpu_capabilities; struct sk_buff *msg = p->msg; struct nlattr *start_cap; int i; start_cap = nla_nest_start(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY); if (!start_cap) return -EMSGSIZE; for (i = 0; i < p->cpu_capabilities_count; ++i) { if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_ID, cpu_cap->cpu)) goto out_cancel_nest; if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE, cpu_cap->performance)) goto out_cancel_nest; if (nla_put_u32(msg, THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY, cpu_cap->efficiency)) goto out_cancel_nest; ++cpu_cap; } nla_nest_end(msg, start_cap); return 0; out_cancel_nest: nla_nest_cancel(msg, start_cap); return -EMSGSIZE; } static int thermal_genl_event_threshold_add(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_THRESHOLD_TEMP, p->temp) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, p->direction)) return -EMSGSIZE; return 0; } static int thermal_genl_event_threshold_flush(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id)) return -EMSGSIZE; return 0; } static int thermal_genl_event_threshold_up(struct param *p) { if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_PREV_TEMP, p->prev_temp) || nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TEMP, p->temp)) return -EMSGSIZE; return 0; } int thermal_genl_event_tz_delete(struct param *p) __attribute__((alias("thermal_genl_event_tz"))); int thermal_genl_event_tz_enable(struct param *p) __attribute__((alias("thermal_genl_event_tz"))); int thermal_genl_event_tz_disable(struct param *p) __attribute__((alias("thermal_genl_event_tz"))); int thermal_genl_event_tz_trip_down(struct param *p) __attribute__((alias("thermal_genl_event_tz_trip_up"))); int thermal_genl_event_threshold_delete(struct param *p) __attribute__((alias("thermal_genl_event_threshold_add"))); int thermal_genl_event_threshold_down(struct param *p) __attribute__((alias("thermal_genl_event_threshold_up"))); static cb_t event_cb[] = { [THERMAL_GENL_EVENT_TZ_CREATE] = thermal_genl_event_tz_create, [THERMAL_GENL_EVENT_TZ_DELETE] = thermal_genl_event_tz_delete, [THERMAL_GENL_EVENT_TZ_ENABLE] = thermal_genl_event_tz_enable, [THERMAL_GENL_EVENT_TZ_DISABLE] = thermal_genl_event_tz_disable, [THERMAL_GENL_EVENT_TZ_TRIP_UP] = thermal_genl_event_tz_trip_up, [THERMAL_GENL_EVENT_TZ_TRIP_DOWN] = thermal_genl_event_tz_trip_down, [THERMAL_GENL_EVENT_TZ_TRIP_CHANGE] = thermal_genl_event_tz_trip_change, [THERMAL_GENL_EVENT_CDEV_ADD] = thermal_genl_event_cdev_add, [THERMAL_GENL_EVENT_CDEV_DELETE] = thermal_genl_event_cdev_delete, [THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = thermal_genl_event_cdev_state_update, [THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = thermal_genl_event_gov_change, [THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE] = thermal_genl_event_cpu_capability_change, [THERMAL_GENL_EVENT_THRESHOLD_ADD] = thermal_genl_event_threshold_add, [THERMAL_GENL_EVENT_THRESHOLD_DELETE] = thermal_genl_event_threshold_delete, [THERMAL_GENL_EVENT_THRESHOLD_FLUSH] = thermal_genl_event_threshold_flush, [THERMAL_GENL_EVENT_THRESHOLD_DOWN] = thermal_genl_event_threshold_down, [THERMAL_GENL_EVENT_THRESHOLD_UP] = thermal_genl_event_threshold_up, }; /* * Generic netlink event encoding */ static int thermal_genl_send_event(enum thermal_genl_event event, struct param *p) { struct sk_buff *msg; int ret = -EMSGSIZE; void *hdr; if (!thermal_group_has_listeners(THERMAL_GENL_EVENT_GROUP)) return 0; msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg) return -ENOMEM; p->msg = msg; hdr = genlmsg_put(msg, 0, 0, &thermal_genl_family, 0, event); if (!hdr) goto out_free_msg; ret = event_cb[event](p); if (ret) goto out_cancel_msg; genlmsg_end(msg, hdr); genlmsg_multicast(&thermal_genl_family, msg, 0, THERMAL_GENL_EVENT_GROUP, GFP_KERNEL); return 0; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); return ret; } int thermal_notify_tz_create(const struct thermal_zone_device *tz) { struct param p = { .tz_id = tz->id, .name = tz->type }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_CREATE, &p); } int thermal_notify_tz_delete(const struct thermal_zone_device *tz) { struct param p = { .tz_id = tz->id }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_DELETE, &p); } int thermal_notify_tz_enable(const struct thermal_zone_device *tz) { struct param p = { .tz_id = tz->id }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_ENABLE, &p); } int thermal_notify_tz_disable(const struct thermal_zone_device *tz) { struct param p = { .tz_id = tz->id }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_DISABLE, &p); } int thermal_notify_tz_trip_down(const struct thermal_zone_device *tz, const struct thermal_trip *trip) { struct param p = { .tz_id = tz->id, .trip_id = thermal_zone_trip_id(tz, trip), .temp = tz->temperature }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_DOWN, &p); } int thermal_notify_tz_trip_up(const struct thermal_zone_device *tz, const struct thermal_trip *trip) { struct param p = { .tz_id = tz->id, .trip_id = thermal_zone_trip_id(tz, trip), .temp = tz->temperature }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_UP, &p); } int thermal_notify_tz_trip_change(const struct thermal_zone_device *tz, const struct thermal_trip *trip) { struct param p = { .tz_id = tz->id, .trip_id = thermal_zone_trip_id(tz, trip), .trip_type = trip->type, .trip_temp = trip->temperature, .trip_hyst = trip->hysteresis }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_TRIP_CHANGE, &p); } int thermal_notify_cdev_state_update(const struct thermal_cooling_device *cdev, int state) { struct param p = { .cdev_id = cdev->id, .cdev_state = state }; return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, &p); } int thermal_notify_cdev_add(const struct thermal_cooling_device *cdev) { struct param p = { .cdev_id = cdev->id, .name = cdev->type, .cdev_max_state = cdev->max_state }; return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_ADD, &p); } int thermal_notify_cdev_delete(const struct thermal_cooling_device *cdev) { struct param p = { .cdev_id = cdev->id }; return thermal_genl_send_event(THERMAL_GENL_EVENT_CDEV_DELETE, &p); } int thermal_notify_tz_gov_change(const struct thermal_zone_device *tz, const char *name) { struct param p = { .tz_id = tz->id, .name = name }; return thermal_genl_send_event(THERMAL_GENL_EVENT_TZ_GOV_CHANGE, &p); } int thermal_genl_cpu_capability_event(int count, struct thermal_genl_cpu_caps *caps) { struct param p = { .cpu_capabilities_count = count, .cpu_capabilities = caps }; return thermal_genl_send_event(THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, &p); } EXPORT_SYMBOL_GPL(thermal_genl_cpu_capability_event); int thermal_notify_threshold_add(const struct thermal_zone_device *tz, int temperature, int direction) { struct param p = { .tz_id = tz->id, .temp = temperature, .direction = direction }; return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_ADD, &p); } int thermal_notify_threshold_delete(const struct thermal_zone_device *tz, int temperature, int direction) { struct param p = { .tz_id = tz->id, .temp = temperature, .direction = direction }; return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_DELETE, &p); } int thermal_notify_threshold_flush(const struct thermal_zone_device *tz) { struct param p = { .tz_id = tz->id }; return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_FLUSH, &p); } int thermal_notify_threshold_down(const struct thermal_zone_device *tz) { struct param p = { .tz_id = tz->id, .temp = tz->temperature, .prev_temp = tz->last_temperature }; return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_DOWN, &p); } int thermal_notify_threshold_up(const struct thermal_zone_device *tz) { struct param p = { .tz_id = tz->id, .temp = tz->temperature, .prev_temp = tz->last_temperature }; return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_UP, &p); } /*************************** Command encoding ********************************/ static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz, void *data) { struct sk_buff *msg = data; if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, tz->id) || nla_put_string(msg, THERMAL_GENL_ATTR_TZ_NAME, tz->type)) return -EMSGSIZE; return 0; } static int thermal_genl_cmd_tz_get_id(struct param *p) { struct sk_buff *msg = p->msg; struct nlattr *start_tz; int ret; start_tz = nla_nest_start(msg, THERMAL_GENL_ATTR_TZ); if (!start_tz) return -EMSGSIZE; ret = for_each_thermal_zone(__thermal_genl_cmd_tz_get_id, msg); if (ret) goto out_cancel_nest; nla_nest_end(msg, start_tz); return 0; out_cancel_nest: nla_nest_cancel(msg, start_tz); return ret; } static int thermal_genl_cmd_tz_get_trip(struct param *p) { struct sk_buff *msg = p->msg; const struct thermal_trip_desc *td; struct nlattr *start_trip; int id; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) return -EINVAL; id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); CLASS(thermal_zone_get_by_id, tz)(id); if (!tz) return -EINVAL; start_trip = nla_nest_start(msg, THERMAL_GENL_ATTR_TZ_TRIP); if (!start_trip) return -EMSGSIZE; guard(thermal_zone)(tz); for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID, thermal_zone_trip_id(tz, trip)) || nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, trip->type) || nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, trip->temperature) || nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, trip->hysteresis)) return -EMSGSIZE; } nla_nest_end(msg, start_trip); return 0; } static int thermal_genl_cmd_tz_get_temp(struct param *p) { struct sk_buff *msg = p->msg; int temp, ret, id; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) return -EINVAL; id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); CLASS(thermal_zone_get_by_id, tz)(id); if (!tz) return -EINVAL; ret = thermal_zone_get_temp(tz, &temp); if (ret) return ret; if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id) || nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TEMP, temp)) return -EMSGSIZE; return 0; } static int thermal_genl_cmd_tz_get_gov(struct param *p) { struct sk_buff *msg = p->msg; int id; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) return -EINVAL; id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); CLASS(thermal_zone_get_by_id, tz)(id); if (!tz) return -EINVAL; guard(thermal_zone)(tz); if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id) || nla_put_string(msg, THERMAL_GENL_ATTR_TZ_GOV_NAME, tz->governor->name)) return -EMSGSIZE; return 0; } static int __thermal_genl_cmd_cdev_get(struct thermal_cooling_device *cdev, void *data) { struct sk_buff *msg = data; if (nla_put_u32(msg, THERMAL_GENL_ATTR_CDEV_ID, cdev->id)) return -EMSGSIZE; if (nla_put_string(msg, THERMAL_GENL_ATTR_CDEV_NAME, cdev->type)) return -EMSGSIZE; return 0; } static int thermal_genl_cmd_cdev_get(struct param *p) { struct sk_buff *msg = p->msg; struct nlattr *start_cdev; int ret; start_cdev = nla_nest_start(msg, THERMAL_GENL_ATTR_CDEV); if (!start_cdev) return -EMSGSIZE; ret = for_each_thermal_cooling_device(__thermal_genl_cmd_cdev_get, msg); if (ret) goto out_cancel_nest; nla_nest_end(msg, start_cdev); return 0; out_cancel_nest: nla_nest_cancel(msg, start_cdev); return ret; } static int __thermal_genl_cmd_threshold_get(struct user_threshold *threshold, void *arg) { struct sk_buff *msg = arg; if (nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_TEMP, threshold->temperature) || nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, threshold->direction)) return -1; return 0; } static int thermal_genl_cmd_threshold_get(struct param *p) { struct sk_buff *msg = p->msg; struct nlattr *start_trip; int id, ret; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) return -EINVAL; id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); CLASS(thermal_zone_get_by_id, tz)(id); if (!tz) return -EINVAL; start_trip = nla_nest_start(msg, THERMAL_GENL_ATTR_THRESHOLD); if (!start_trip) return -EMSGSIZE; ret = thermal_thresholds_for_each(tz, __thermal_genl_cmd_threshold_get, msg); if (ret) return -EMSGSIZE; nla_nest_end(msg, start_trip); return 0; } static int thermal_genl_cmd_threshold_add(struct param *p) { int id, temp, direction; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID] || !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP] || !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]) return -EINVAL; id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); temp = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]); direction = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]); CLASS(thermal_zone_get_by_id, tz)(id); if (!tz) return -EINVAL; guard(thermal_zone)(tz); return thermal_thresholds_add(tz, temp, direction); } static int thermal_genl_cmd_threshold_delete(struct param *p) { int id, temp, direction; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID] || !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP] || !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]) return -EINVAL; id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); temp = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]); direction = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]); CLASS(thermal_zone_get_by_id, tz)(id); if (!tz) return -EINVAL; guard(thermal_zone)(tz); return thermal_thresholds_delete(tz, temp, direction); } static int thermal_genl_cmd_threshold_flush(struct param *p) { int id; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID]) return -EINVAL; id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]); CLASS(thermal_zone_get_by_id, tz)(id); if (!tz) return -EINVAL; guard(thermal_zone)(tz); thermal_thresholds_flush(tz); return 0; } static cb_t cmd_cb[] = { [THERMAL_GENL_CMD_TZ_GET_ID] = thermal_genl_cmd_tz_get_id, [THERMAL_GENL_CMD_TZ_GET_TRIP] = thermal_genl_cmd_tz_get_trip, [THERMAL_GENL_CMD_TZ_GET_TEMP] = thermal_genl_cmd_tz_get_temp, [THERMAL_GENL_CMD_TZ_GET_GOV] = thermal_genl_cmd_tz_get_gov, [THERMAL_GENL_CMD_CDEV_GET] = thermal_genl_cmd_cdev_get, [THERMAL_GENL_CMD_THRESHOLD_GET] = thermal_genl_cmd_threshold_get, [THERMAL_GENL_CMD_THRESHOLD_ADD] = thermal_genl_cmd_threshold_add, [THERMAL_GENL_CMD_THRESHOLD_DELETE] = thermal_genl_cmd_threshold_delete, [THERMAL_GENL_CMD_THRESHOLD_FLUSH] = thermal_genl_cmd_threshold_flush, }; static int thermal_genl_cmd_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct param p = { .msg = skb }; const struct genl_dumpit_info *info = genl_dumpit_info(cb); int cmd = info->op.cmd; int ret; void *hdr; hdr = genlmsg_put(skb, 0, 0, &thermal_genl_family, 0, cmd); if (!hdr) return -EMSGSIZE; ret = cmd_cb[cmd](&p); if (ret) goto out_cancel_msg; genlmsg_end(skb, hdr); return 0; out_cancel_msg: genlmsg_cancel(skb, hdr); return ret; } static int thermal_genl_cmd_doit(struct sk_buff *skb, struct genl_info *info) { struct param p = { .attrs = info->attrs }; struct sk_buff *msg; void *hdr; int cmd = info->genlhdr->cmd; int ret = -EMSGSIZE; msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg) return -ENOMEM; p.msg = msg; hdr = genlmsg_put_reply(msg, info, &thermal_genl_family, 0, cmd); if (!hdr) goto out_free_msg; ret = cmd_cb[cmd](&p); if (ret) goto out_cancel_msg; genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); return ret; } static int thermal_genl_bind(int mcgrp) { struct thermal_genl_notify n = { .mcgrp = mcgrp }; if (WARN_ON_ONCE(mcgrp > THERMAL_GENL_MAX_GROUP)) return -EINVAL; blocking_notifier_call_chain(&thermal_genl_chain, THERMAL_NOTIFY_BIND, &n); return 0; } static void thermal_genl_unbind(int mcgrp) { struct thermal_genl_notify n = { .mcgrp = mcgrp }; if (WARN_ON_ONCE(mcgrp > THERMAL_GENL_MAX_GROUP)) return; blocking_notifier_call_chain(&thermal_genl_chain, THERMAL_NOTIFY_UNBIND, &n); } static const struct genl_small_ops thermal_genl_ops[] = { { .cmd = THERMAL_GENL_CMD_TZ_GET_ID, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = thermal_genl_cmd_dumpit, }, { .cmd = THERMAL_GENL_CMD_TZ_GET_TRIP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = thermal_genl_cmd_doit, }, { .cmd = THERMAL_GENL_CMD_TZ_GET_TEMP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = thermal_genl_cmd_doit, }, { .cmd = THERMAL_GENL_CMD_TZ_GET_GOV, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = thermal_genl_cmd_doit, }, { .cmd = THERMAL_GENL_CMD_CDEV_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = thermal_genl_cmd_dumpit, }, { .cmd = THERMAL_GENL_CMD_THRESHOLD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = thermal_genl_cmd_doit, }, { .cmd = THERMAL_GENL_CMD_THRESHOLD_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = thermal_genl_cmd_doit, }, { .cmd = THERMAL_GENL_CMD_THRESHOLD_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = thermal_genl_cmd_doit, }, { .cmd = THERMAL_GENL_CMD_THRESHOLD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = thermal_genl_cmd_doit, }, }; static struct genl_family thermal_genl_family __ro_after_init = { .hdrsize = 0, .name = THERMAL_GENL_FAMILY_NAME, .version = THERMAL_GENL_VERSION, .maxattr = THERMAL_GENL_ATTR_MAX, .policy = thermal_genl_policy, .bind = thermal_genl_bind, .unbind = thermal_genl_unbind, .small_ops = thermal_genl_ops, .n_small_ops = ARRAY_SIZE(thermal_genl_ops), .resv_start_op = __THERMAL_GENL_CMD_MAX, .mcgrps = thermal_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(thermal_genl_mcgrps), }; int thermal_genl_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&thermal_genl_chain, nb); } int thermal_genl_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&thermal_genl_chain, nb); } int __init thermal_netlink_init(void) { return genl_register_family(&thermal_genl_family); } void __init thermal_netlink_exit(void) { genl_unregister_family(&thermal_genl_family); } |
214 214 215 216 214 216 215 214 213 217 212 217 213 213 210 2 2 2 2 2 2 2 1 1 2 2 3 3 2 2 213 212 213 215 1 210 212 211 212 215 212 209 212 214 212 215 212 211 208 210 215 21 21 2 2 2 2 215 21 211 123 209 208 118 92 3 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 | // SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * * Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation * * ChangeLog: * * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> * Changed the compression method from stem compression to "table lookup" * compression (see scripts/kallsyms.c for a more complete description) */ #include <linux/kallsyms.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/kdb.h> #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/ctype.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/kprobes.h> #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/bsearch.h> #include <linux/btf_ids.h> #include "kallsyms_internal.h" /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result, size_t maxlen) { int len, skipped_first = 0; const char *tptr; const u8 *data; /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; len = *data; data++; off++; /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ if ((len & 0x80) != 0) { len = (len & 0x7F) | (*data << 7); data++; off++; } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ off += len; /* * For every byte on the compressed symbol data, copy the table * entry for that byte. */ while (len) { tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; data++; len--; while (*tptr) { if (skipped_first) { if (maxlen <= 1) goto tail; *result = *tptr; result++; maxlen--; } else skipped_first = 1; tptr++; } } tail: if (maxlen) *result = '\0'; /* Return to offset to the next symbol. */ return off; } /* * Get symbol type information. This is encoded as a single char at the * beginning of the symbol name. */ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, * and return the first char from this token. */ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[ |